Unverified commit 8ea5e44a, authored by youkaichao and committed by GitHub
Browse files

[CI/Test] improve robustness of test (vllm_runner) (#5357)

[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)
parent 9fb900f9
...@@ -46,12 +46,11 @@ def test_models( ...@@ -46,12 +46,11 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner(model, with vllm_runner(model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
...@@ -43,17 +43,16 @@ def test_models( ...@@ -43,17 +43,16 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
...@@ -46,17 +46,16 @@ def test_chunked_prefill_recompute( ...@@ -46,17 +46,16 @@ def test_chunked_prefill_recompute(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT) ARTIFICIAL_PREEMPTION_MAX_CNT)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
...@@ -84,17 +83,16 @@ def test_preemption( ...@@ -84,17 +83,16 @@ def test_preemption(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
disable_log_stats=False, disable_log_stats=False,
) ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT) ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = ( total_preemption = (
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
...@@ -139,19 +137,18 @@ def test_swap( ...@@ -139,19 +137,18 @@ def test_swap(
hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
max_tokens) max_tokens)
vllm_model = vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
swap_space=10, swap_space=10,
disable_log_stats=False, disable_log_stats=False,
) ) as vllm_model:
vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, vllm_outputs = vllm_model.generate_beam_search(example_prompts,
max_tokens) beam_width, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT) ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = ( total_preemption = (
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, _ = hf_outputs[i] hf_output_ids, _ = hf_outputs[i]
...@@ -196,28 +193,28 @@ def test_swap_infeasible( ...@@ -196,28 +193,28 @@ def test_swap_infeasible(
decode_blocks = max_tokens // BLOCK_SIZE decode_blocks = max_tokens // BLOCK_SIZE
example_prompts = example_prompts[:1] example_prompts = example_prompts[:1]
vllm_model = vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
swap_space=10, swap_space=10,
block_size=BLOCK_SIZE, block_size=BLOCK_SIZE,
# Since beam search have more than 1 sequence, prefill + decode blocks # Since beam search have more than 1 sequence, prefill +
# are not enough to finish. # decode blocks are not enough to finish.
num_gpu_blocks_override=prefill_blocks + decode_blocks, num_gpu_blocks_override=prefill_blocks + decode_blocks,
max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE, max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
) ) as vllm_model:
sampling_params = SamplingParams(n=beam_width, sampling_params = SamplingParams(n=beam_width,
use_beam_search=True, use_beam_search=True,
temperature=0.0, temperature=0.0,
max_tokens=max_tokens, max_tokens=max_tokens,
ignore_eos=True) ignore_eos=True)
req_outputs = vllm_model.model.generate( req_outputs = vllm_model.model.generate(
example_prompts, example_prompts,
sampling_params=sampling_params, sampling_params=sampling_params,
) )
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT) ARTIFICIAL_PREEMPTION_MAX_CNT)
del vllm_model
# Verify the request is ignored and not hang. # Verify the request is ignored and not hang.
assert req_outputs[0].outputs[0].finish_reason == "length" assert req_outputs[0].outputs[0].finish_reason == "length"
...@@ -236,25 +233,26 @@ def test_preemption_infeasible( ...@@ -236,25 +233,26 @@ def test_preemption_infeasible(
BLOCK_SIZE = 16 BLOCK_SIZE = 16
prefill_blocks = 2 prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE decode_blocks = max_tokens // BLOCK_SIZE
vllm_model = vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
block_size=BLOCK_SIZE, block_size=BLOCK_SIZE,
# Not enough gpu blocks to complete a single sequence. # Not enough gpu blocks to complete a single sequence.
# preemption should happen, and the sequence should be # preemption should happen, and the sequence should be
# ignored instead of hanging forever. # ignored instead of hanging forever.
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
) ) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) sampling_params = SamplingParams(max_tokens=max_tokens,
req_outputs = vllm_model.model.generate( ignore_eos=True)
example_prompts, req_outputs = vllm_model.model.generate(
sampling_params=sampling_params, example_prompts,
) sampling_params=sampling_params,
)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
del vllm_model
# Verify the request is ignored and not hang. # Verify the request is ignored and not hang.
for req_output in req_outputs: for req_output in req_outputs:
outputs = req_output.outputs outputs = req_output.outputs
......
...@@ -493,7 +493,10 @@ class VllmRunner: ...@@ -493,7 +493,10 @@ class VllmRunner:
outputs.append(embedding) outputs.append(embedding)
return outputs return outputs
def __del__(self): def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
del self.model del self.model
cleanup() cleanup()
......
...@@ -45,14 +45,13 @@ def test_models( ...@@ -45,14 +45,13 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner( with vllm_runner(model,
model, dtype=dtype,
dtype=dtype, tensor_parallel_size=2,
tensor_parallel_size=2, enforce_eager=enforce_eager,
enforce_eager=enforce_eager, distributed_executor_backend=distributed_executor_backend
distributed_executor_backend=distributed_executor_backend) ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
...@@ -48,17 +48,16 @@ def test_models( ...@@ -48,17 +48,16 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
tensor_parallel_size=2, tensor_parallel_size=2,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
) ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
...@@ -19,9 +19,8 @@ MAX_TOKENS = 1024 ...@@ -19,9 +19,8 @@ MAX_TOKENS = 1024
@pytest.fixture @pytest.fixture
def vllm_model(vllm_runner): def vllm_model(vllm_runner):
vllm_model = vllm_runner(MODEL) with vllm_runner(MODEL) as vllm_model:
yield vllm_model yield vllm_model
del vllm_model
def test_stop_reason(vllm_model, example_prompts): def test_stop_reason(vllm_model, example_prompts):
......
...@@ -10,7 +10,8 @@ MAX_TOKENS = 200 ...@@ -10,7 +10,8 @@ MAX_TOKENS = 200
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def vllm_model(vllm_runner): def vllm_model(vllm_runner):
return vllm_runner(MODEL) with vllm_runner(MODEL) as vllm_model:
yield vllm_model
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
......
...@@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens( ...@@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens(
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
vllm_model = vllm_runner(model, with vllm_runner(model,
dtype=dtype, dtype=dtype,
disable_log_stats=False, disable_log_stats=False,
gpu_memory_utilization=0.4) gpu_memory_utilization=0.4) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.model.get_tokenizer()
prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] prompt_token_counts = [
# This test needs at least 2 prompts in a batch of different lengths to len(tokenizer.encode(p)) for p in example_prompts
# verify their token count is correct despite padding. ]
assert len(example_prompts) > 1, "at least 2 prompts are required" # This test needs at least 2 prompts in a batch of different lengths to
assert prompt_token_counts[0] != prompt_token_counts[1], ( # verify their token count is correct despite padding.
"prompts of different lengths are required") assert len(example_prompts) > 1, "at least 2 prompts are required"
vllm_prompt_token_count = sum(prompt_token_counts) assert prompt_token_counts[0] != prompt_token_counts[1], (
"prompts of different lengths are required")
_ = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_prompt_token_count = sum(prompt_token_counts)
stat_logger = vllm_model.model.llm_engine.stat_logger
metric_count = stat_logger.metrics.counter_prompt_tokens.labels( _ = vllm_model.generate_greedy(example_prompts, max_tokens)
**stat_logger.labels)._value.get() stat_logger = vllm_model.model.llm_engine.stat_logger
metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
**stat_logger.labels)._value.get()
assert vllm_prompt_token_count == metric_count, ( assert vllm_prompt_token_count == metric_count, (
f"prompt token count: {vllm_prompt_token_count!r}\n" f"prompt token count: {vllm_prompt_token_count!r}\n"
...@@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens( ...@@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens(
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
vllm_model = vllm_runner(model, with vllm_runner(model,
dtype=dtype, dtype=dtype,
disable_log_stats=False, disable_log_stats=False,
gpu_memory_utilization=0.4) gpu_memory_utilization=0.4) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_logger stat_logger = vllm_model.model.llm_engine.stat_logger
metric_count = stat_logger.metrics.counter_generation_tokens.labels( metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get() **stat_logger.labels)._value.get()
vllm_generation_count = 0 vllm_generation_count = 0
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
vllm_output_ids, vllm_output_str = vllm_outputs[i] vllm_output_ids, vllm_output_str = vllm_outputs[i]
prompt_ids = tokenizer.encode(example_prompts[i]) prompt_ids = tokenizer.encode(example_prompts[i])
# vllm_output_ids contains both prompt tokens and generation tokens. # vllm_output_ids contains both prompt tokens and generation tokens.
# We're interested only in the count of the generation tokens. # We're interested only in the count of the generation tokens.
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
assert vllm_generation_count == metric_count, ( assert vllm_generation_count == metric_count, (
f"generation token count: {vllm_generation_count!r}\n" f"generation token count: {vllm_generation_count!r}\n"
...@@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens( ...@@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens(
[None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
served_model_name: List[str]) -> None: served_model_name: List[str]) -> None:
vllm_model = vllm_runner(model, with vllm_runner(model,
dtype=dtype, dtype=dtype,
disable_log_stats=False, disable_log_stats=False,
gpu_memory_utilization=0.3, gpu_memory_utilization=0.3,
served_model_name=served_model_name) served_model_name=served_model_name) as vllm_model:
stat_logger = vllm_model.model.llm_engine.stat_logger stat_logger = vllm_model.model.llm_engine.stat_logger
metrics_tag_content = stat_logger.labels["model_name"] metrics_tag_content = stat_logger.labels["model_name"]
del vllm_model
if served_model_name is None or served_model_name == []: if served_model_name is None or served_model_name == []:
assert metrics_tag_content == model, ( assert metrics_tag_content == model, (
......
...@@ -82,10 +82,9 @@ def test_models( ...@@ -82,10 +82,9 @@ def test_models(
num_logprobs: int, num_logprobs: int,
) -> None: ) -> None:
vllm_model = vllm_runner(model, dtype=dtype) with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, vllm_outputs = vllm_model.generate_greedy_logprobs(
max_tokens, example_prompts, max_tokens, num_logprobs)
num_logprobs)
# loop through the prompts to compare against the ground truth generations # loop through the prompts to compare against the ground truth generations
for prompt_idx in range(len(example_prompts)): for prompt_idx in range(len(example_prompts)):
......
...@@ -37,9 +37,8 @@ def test_models( ...@@ -37,9 +37,8 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner(model, dtype=dtype) with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
...@@ -57,9 +56,8 @@ def test_model_print( ...@@ -57,9 +56,8 @@ def test_model_print(
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
vllm_model = vllm_runner(model, dtype=dtype) with vllm_runner(model, dtype=dtype) as vllm_model:
# This test is for verifying whether the model's extra_repr # This test is for verifying whether the model's extra_repr
# can be printed correctly. # can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker. print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model) model_runner.model)
del vllm_model
...@@ -31,9 +31,8 @@ def test_models( ...@@ -31,9 +31,8 @@ def test_models(
with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model: with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
hf_outputs = hf_model.encode(example_prompts) hf_outputs = hf_model.encode(example_prompts)
vllm_model = vllm_runner(model, dtype=dtype) with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts) vllm_outputs = vllm_model.encode(example_prompts)
del vllm_model
similarities = compare_embeddings(hf_outputs, vllm_outputs) similarities = compare_embeddings(hf_outputs, vllm_outputs)
all_similarities = torch.stack(similarities) all_similarities = torch.stack(similarities)
......
...@@ -70,32 +70,29 @@ def test_models( ...@@ -70,32 +70,29 @@ def test_models(
model_name, revision = model model_name, revision = model
# Run marlin. # Run marlin.
gptq_marlin_model = vllm_runner(model_name=model_name, with vllm_runner(model_name=model_name,
revision=revision, revision=revision,
dtype=dtype, dtype=dtype,
quantization="marlin", quantization="marlin",
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) tensor_parallel_size=1) as gptq_marlin_model:
gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs( gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs) example_prompts[:-1], max_tokens, num_logprobs)
del gptq_marlin_model
_ROPE_DICT.clear() # clear rope cache to avoid rope dtype error _ROPE_DICT.clear() # clear rope cache to avoid rope dtype error
# Run gptq. # Run gptq.
# The naive gptq kernel doesn't support bf16 yet. # The naive gptq kernel doesn't support bf16 yet.
# Here we always compare fp16/bf16 gpt marlin kernel # Here we always compare fp16/bf16 gpt marlin kernel
# to fp16 gptq kernel. # to fp16 gptq kernel.
gptq_model = vllm_runner(model_name=model_name, with vllm_runner(model_name=model_name,
revision=revision, revision=revision,
dtype="half", dtype="half",
quantization="gptq", quantization="gptq",
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) tensor_parallel_size=1) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1], gptq_outputs = gptq_model.generate_greedy_logprobs(
max_tokens, example_prompts[:-1], max_tokens, num_logprobs)
num_logprobs)
del gptq_model
check_logprobs_close( check_logprobs_close(
outputs_0_lst=gptq_outputs, outputs_0_lst=gptq_outputs,
......
...@@ -61,20 +61,16 @@ def test_models( ...@@ -61,20 +61,16 @@ def test_models(
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
) -> None: ) -> None:
marlin_24_model = vllm_runner(model_pair.model_marlin, with vllm_runner(model_pair.model_marlin,
dtype=dtype, dtype=dtype,
quantization="gptq_marlin_24") quantization="gptq_marlin_24") as marlin_24_model:
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs( marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
del marlin_24_model
gptq_model = vllm_runner(model_pair.model_gptq, with vllm_runner(model_pair.model_gptq, dtype=dtype,
dtype=dtype, quantization="gptq") as gptq_model:
quantization="gptq") gptq_outputs = gptq_model.generate_greedy_logprobs(
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts, example_prompts, max_tokens, num_logprobs)
max_tokens,
num_logprobs)
del gptq_model
check_logprobs_close( check_logprobs_close(
outputs_0_lst=gptq_outputs, outputs_0_lst=gptq_outputs,
......
...@@ -94,14 +94,13 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, ...@@ -94,14 +94,13 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
for p in HF_IMAGE_PROMPTS for p in HF_IMAGE_PROMPTS
] ]
vllm_model = vllm_runner(model_id, with vllm_runner(model_id,
dtype=dtype, dtype=dtype,
enforce_eager=True, enforce_eager=True,
**vlm_config.as_cli_args_dict()) **vlm_config.as_cli_args_dict()) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
max_tokens, max_tokens,
images=vllm_images) images=vllm_images)
del vllm_model
for i in range(len(HF_IMAGE_PROMPTS)): for i in range(len(HF_IMAGE_PROMPTS)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
...@@ -59,20 +59,16 @@ def test_models( ...@@ -59,20 +59,16 @@ def test_models(
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
) -> None: ) -> None:
marlin_model = vllm_runner(model_pair.model_marlin, with vllm_runner(model_pair.model_marlin,
dtype=dtype, dtype=dtype,
quantization="marlin") quantization="marlin") as marlin_model:
marlin_outputs = marlin_model.generate_greedy_logprobs( marlin_outputs = marlin_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
del marlin_model
with vllm_runner(model_pair.model_gptq, dtype=dtype,
gptq_model = vllm_runner(model_pair.model_gptq, quantization="gptq") as gptq_model:
dtype=dtype, gptq_outputs = gptq_model.generate_greedy_logprobs(
quantization="gptq") example_prompts, max_tokens, num_logprobs)
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
max_tokens,
num_logprobs)
del gptq_model
check_logprobs_close( check_logprobs_close(
outputs_0_lst=gptq_outputs, outputs_0_lst=gptq_outputs,
......
...@@ -30,11 +30,9 @@ def test_models( ...@@ -30,11 +30,9 @@ def test_models(
hf_outputs = hf_model.generate_greedy_logprobs_limit( hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
vllm_model = vllm_runner(model, dtype=dtype) with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, vllm_outputs = vllm_model.generate_greedy_logprobs(
max_tokens, example_prompts, max_tokens, num_logprobs)
num_logprobs)
del vllm_model
check_logprobs_close( check_logprobs_close(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
......
...@@ -37,9 +37,8 @@ def test_models( ...@@ -37,9 +37,8 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner(model, dtype=dtype) with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
...@@ -57,9 +56,8 @@ def test_model_print( ...@@ -57,9 +56,8 @@ def test_model_print(
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
vllm_model = vllm_runner(model, dtype=dtype) with vllm_runner(model, dtype=dtype) as vllm_model:
# This test is for verifying whether the model's extra_repr # This test is for verifying whether the model's extra_repr
# can be printed correctly. # can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker. print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model) model_runner.model)
del vllm_model
...@@ -16,65 +16,65 @@ capability = capability[0] * 10 + capability[1] ...@@ -16,65 +16,65 @@ capability = capability[0] * 10 + capability[1]
capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(), capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
def test_load_bnb_model(vllm_runner) -> None: def test_load_bnb_model(vllm_runner) -> None:
llm = vllm_runner('huggyllama/llama-7b', with vllm_runner('huggyllama/llama-7b',
quantization='bitsandbytes', quantization='bitsandbytes',
load_format='bitsandbytes', load_format='bitsandbytes',
enforce_eager=True) enforce_eager=True) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
# check the weights in MLP & SelfAttention are quantized to torch.uint8 # check the weights in MLP & SelfAttention are quantized to torch.uint8
qweight = model.model.layers[0].mlp.gate_up_proj.qweight qweight = model.model.layers[0].mlp.gate_up_proj.qweight
assert qweight.dtype == torch.uint8, ( assert qweight.dtype == torch.uint8, (
f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}') f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}')
qweight = model.model.layers[0].mlp.down_proj.qweight qweight = model.model.layers[0].mlp.down_proj.qweight
assert qweight.dtype == torch.uint8, ( assert qweight.dtype == torch.uint8, (
f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}') f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}')
qweight = model.model.layers[0].self_attn.o_proj.qweight qweight = model.model.layers[0].self_attn.o_proj.qweight
assert qweight.dtype == torch.uint8, ( assert qweight.dtype == torch.uint8, (
f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}') f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}')
qweight = model.model.layers[0].self_attn.qkv_proj.qweight qweight = model.model.layers[0].self_attn.qkv_proj.qweight
assert qweight.dtype == torch.uint8, ( assert qweight.dtype == torch.uint8, (
f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}') f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}')
# some weights should not be quantized # some weights should not be quantized
weight = model.lm_head.weight weight = model.lm_head.weight
assert weight.dtype != torch.uint8, ( assert weight.dtype != torch.uint8, (
'lm_head weight dtype should not be torch.uint8') 'lm_head weight dtype should not be torch.uint8')
weight = model.model.embed_tokens.weight weight = model.model.embed_tokens.weight
assert weight.dtype != torch.uint8, ( assert weight.dtype != torch.uint8, (
'embed_tokens weight dtype should not be torch.uint8') 'embed_tokens weight dtype should not be torch.uint8')
weight = model.model.layers[0].input_layernorm.weight weight = model.model.layers[0].input_layernorm.weight
assert weight.dtype != torch.uint8, ( assert weight.dtype != torch.uint8, (
'input_layernorm weight dtype should not be torch.uint8') 'input_layernorm weight dtype should not be torch.uint8')
weight = model.model.layers[0].post_attention_layernorm.weight weight = model.model.layers[0].post_attention_layernorm.weight
assert weight.dtype != torch.uint8, ( assert weight.dtype != torch.uint8, (
'input_layernorm weight dtype should not be torch.uint8') 'input_layernorm weight dtype should not be torch.uint8')
# check the output of the model is expected # check the output of the model is expected
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
logprobs=1, logprobs=1,
prompt_logprobs=1, prompt_logprobs=1,
max_tokens=8) max_tokens=8)
prompts = ['That which does not kill us', 'To be or not to be,'] prompts = ['That which does not kill us', 'To be or not to be,']
expected_outputs = [ expected_outputs = [
'That which does not kill us makes us stronger.', 'That which does not kill us makes us stronger.',
'To be or not to be, that is the question.' 'To be or not to be, that is the question.'
] ]
outputs = llm.generate(prompts, sampling_params=sampling_params) outputs = llm.generate(prompts, sampling_params=sampling_params)
assert len(outputs) == len(prompts) assert len(outputs) == len(prompts)
for index in range(len(outputs)): for index in range(len(outputs)):
# compare the first line of the output # compare the first line of the output
actual_output = outputs[index][1][0].split('\n', 1)[0] actual_output = outputs[index][1][0].split('\n', 1)[0]
expected_output = expected_outputs[index].split('\n', 1)[0] expected_output = expected_outputs[index].split('\n', 1)[0]
assert actual_output == expected_output, ( assert actual_output == expected_output, (
f'Expected: {expected_output}, but got: {actual_output}') f'Expected: {expected_output}, but got: {actual_output}')
...@@ -12,42 +12,45 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso ...@@ -12,42 +12,45 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
def test_compressed_tensors_w8a8_static_setup(vllm_runner): def test_compressed_tensors_w8a8_static_setup(vllm_runner):
model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed" model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True) with vllm_runner(model_path, quantization="sparseml",
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model enforce_eager=True) as llm:
layer = model.model.layers[0] model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj qkv_proj = layer.self_attn.qkv_proj
o_proj = layer.self_attn.o_proj o_proj = layer.self_attn.o_proj
gate_up_proj = layer.mlp.gate_up_proj gate_up_proj = layer.mlp.gate_up_proj
down_proj = layer.mlp.down_proj down_proj = layer.mlp.down_proj
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(gate_up_proj.quant_method,
assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod) CompressedTensorsLinearMethod)
assert isinstance(down_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
assert qkv_proj.weight.dtype is torch.int8 assert qkv_proj.weight.dtype is torch.int8
assert o_proj.weight.dtype is torch.int8 assert o_proj.weight.dtype is torch.int8
assert gate_up_proj.weight.dtype is torch.int8 assert gate_up_proj.weight.dtype is torch.int8
assert qkv_proj.weight_scale.shard_splitter is not None assert qkv_proj.weight_scale.shard_splitter is not None
assert qkv_proj.weight_scale.logical_widths is not None assert qkv_proj.weight_scale.logical_widths is not None
assert qkv_proj.input_scale.dtype is torch.float32 assert qkv_proj.input_scale.dtype is torch.float32
def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
model_path = "nm-testing/tinyllama-one-shot-dynamic-test" model_path = "nm-testing/tinyllama-one-shot-dynamic-test"
llm = vllm_runner(model_path, with vllm_runner(model_path,
quantization="sparseml", quantization="sparseml",
enforce_eager=True, enforce_eager=True,
dtype=torch.float16) dtype=torch.float16) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0] layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
assert qkv_proj.weight.dtype is torch.int8 assert qkv_proj.weight.dtype is torch.int8
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment