Unverified Commit 57e9bf18 authored by Nicolò Lucchesi's avatar Nicolò Lucchesi Committed by GitHub
Browse files

[CI] Whisper logprobs tests (#30504)


Signed-off-by: default avatarNickLucche <nlucches@redhat.com>
parent 2f32a68d
...@@ -702,10 +702,16 @@ class HfRunner: ...@@ -702,10 +702,16 @@ class HfRunner:
**kwargs, **kwargs,
) )
# Encoder-decoder models return decoder_hidden_states instead of
# hidden_states
hidden_states = (
getattr(output, "hidden_states", None) or output.decoder_hidden_states
)
( (
seq_logprobs_lst, seq_logprobs_lst,
output_len, output_len,
) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs) ) = self._hidden_states_to_logprobs(hidden_states, num_logprobs)
all_logprobs.append(seq_logprobs_lst) all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0] seq_ids = output.sequences[0]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import Any
import librosa
import pytest import pytest
from transformers import AutoModelForSpeechSeq2Seq
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.platforms import current_platform
from ....conftest import VllmRunner from ....conftest import HfRunner, PromptAudioInput, VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test from ....utils import create_new_process_for_each_test, multi_gpu_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
HF_PROMPT = ""
# Whisper expects 16kHz audio
WHISPER_SAMPLE_RATE = 16000
PROMPTS = [
{ @pytest.fixture(autouse=True)
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", def use_spawn_for_whisper(monkeypatch):
"multi_modal_data": { """Whisper has issues with forked workers, use spawn instead."""
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
},
},
{ # Test explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "",
"multi_modal_data": {
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
},
]
EXPECTED = {
"openai/whisper-tiny": [
" He has birth words I spoke in the original corner of that. And a"
" little piece of black coat poetry. Mary had a little sandwich,"
" sweet, with white and snow. And everyone had it very went the last"
" would sure to go.",
" >> And the old one, fit John the way to Edgar Martinez. >> One more"
" to line down the field line for our base camp. Here comes joy. Here"
" is June and the third base. They're going to wave him in. The throw"
" to the plate will be late. The Mariners are going to play for the"
" American League Championship. I don't believe it. It just continues"
" by all five.",
],
"openai/whisper-small": [
" The first words I spoke in the original pornograph. A little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite a"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the old one pitch on the way to Edgar Martinez one month. Here"
" comes joy. Here is Junior to third base. They're gonna wave him"
" in. The throw to the plate will be late. The Mariners are going to"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my.",
],
"openai/whisper-medium": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite as"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez swung on the line"
" down the left field line for Obeyshev. Here comes Joy. Here is"
" Jorgen at third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh"
" my.",
],
"openai/whisper-large-v3": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its feet were quite as"
" slow, and everywhere that Mary went, the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
" Now the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my.",
],
"openai/whisper-large-v3-turbo": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its streets were quite"
" as slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
" down the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my.",
],
}
def run_test( def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
model: str, model: str,
*, *,
max_model_len: int,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int, tensor_parallel_size: int,
distributed_executor_backend: str | None = None, distributed_executor_backend: str | None = None,
dtype: str = "half", enforce_eager: bool = True,
) -> None: ) -> None:
prompt_list = PROMPTS * 10 """Inference result should be the same between hf and vllm.
expected_list = EXPECTED[model] * 10
All the audio fixtures for the test are from AudioAsset.
For huggingface runner, we provide the audio as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
"""
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
max_model_len=448, max_model_len=max_model_len,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
# TODO (NickLucche) figure out output differences with non-eager and re-enable limit_mm_per_prompt={"audio": 2},
enforce_eager=True, enforce_eager=enforce_eager,
disable_custom_all_reduce=True,
) as vllm_model: ) as vllm_model:
llm = vllm_model.llm vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(
sampling_params = SamplingParams( vllm_prompts,
temperature=0, max_tokens,
top_p=1.0, num_logprobs=num_logprobs,
max_tokens=200, audios=audios,
)
for vllm_prompts, _, audios in inputs
]
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(
hf_prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
)
for _, hf_prompts, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
) )
outputs = llm.generate(prompt_list, sampling_params)
for output, expected in zip(outputs, expected_list): @pytest.fixture
print(output.outputs[0].text) def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
assert output.outputs[0].text == expected audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
inputs = []
for asset in audio_assets:
audio, orig_sr = asset.audio_and_sample_rate
# Resample to Whisper's expected sample rate (16kHz)
if orig_sr != WHISPER_SAMPLE_RATE:
audio = librosa.resample(
audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
)
# vLLM prompts, HF prompts, audio inputs
inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
return inputs
@pytest.mark.core_model def check_model_available(model: str) -> None:
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
@pytest.mark.parametrize("dtype", ["half"]) model_info.check_available_online(on_fail="skip")
@create_new_process_for_each_test() model_info.check_transformers_version(on_fail="skip")
def test_models(vllm_runner, model, dtype) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=1,
dtype=dtype,
)
@pytest.mark.core_model
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_models_cpu(vllm_runner, model, dtype) -> None: @pytest.mark.parametrize("num_logprobs", [5])
# @create_new_process_for_each_test() does not work for some runners @pytest.mark.parametrize("enforce_eager", [True, False])
# TODO: to fix cpu privilege issues in run-cpu-test-arm.sh @create_new_process_for_each_test("spawn")
def test_models(
hf_runner,
vllm_runner,
model: str,
dtype: str,
num_logprobs: int,
input_audios,
enforce_eager: bool,
) -> None:
check_model_available(model)
if current_platform.is_cpu() and not enforce_eager:
pytest.skip("Skipping test for CPU with non-eager mode")
run_test( run_test(
hf_runner,
vllm_runner, vllm_runner,
input_audios,
model, model,
tensor_parallel_size=1,
dtype=dtype, dtype=dtype,
max_model_len=448,
max_tokens=200,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
enforce_eager=enforce_eager,
) )
...@@ -152,15 +148,31 @@ def test_models_cpu(vllm_runner, model, dtype) -> None: ...@@ -152,15 +148,31 @@ def test_models_cpu(vllm_runner, model, dtype) -> None:
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@create_new_process_for_each_test() @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models_distributed( def test_models_distributed(
hf_runner,
vllm_runner, vllm_runner,
model, model: str,
distributed_executor_backend, distributed_executor_backend: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
input_audios,
) -> None: ) -> None:
check_model_available(model)
run_test( run_test(
hf_runner,
vllm_runner, vllm_runner,
input_audios,
model, model,
dtype=dtype,
max_model_len=448,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=2, tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
enforce_eager=False,
) )
...@@ -840,7 +840,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -840,7 +840,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online=False, is_available_online=False,
), ),
# [Encoder-decoder] # [Encoder-decoder]
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), "WhisperForConditionalGeneration": _HfExamplesInfo(
"openai/whisper-large-v3-turbo",
extras={"v3": "openai/whisper-large-v3"},
),
# [Cross-encoder] # [Cross-encoder]
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment