Unverified Commit 66d6be08 authored by Binyao Jiang's avatar Binyao Jiang Committed by GitHub
Browse files

Bug fix: use correct mm_items in embed_mm_inputs (#8893)

parent 1c1f8a11
......@@ -560,7 +560,7 @@ def embed_mm_inputs(
]
items_size[i + 1] = len(mm_items)
items_offsets.append(
flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
flatten_nested_list([item.offsets for item in mm_items])
)
items_size = torch.cumsum(items_size, dim=0).tolist()
......
......@@ -189,6 +189,9 @@ class TestGemma3nServer(TestOpenAIVisionServer):
# This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
# self._test_audio_ambient_completion()
def _test_mixed_image_audio_chat_completion(self):
    """Run the inherited mixed image + audio chat-completion check.

    This override exists so the Gemma3n server class explicitly opts in to
    the mixed-modality check defined on the parent test class.

    NOTE(review): the leading underscore means unittest's default
    ``test*`` method discovery will not collect this method. If the check
    is meant to run automatically for this server, expose it through a
    ``test_``-prefixed method -- confirm the intent with the test owners.
    """
    # Delegate through super(): calling the method on ``self`` would
    # resolve straight back to this override and recurse until
    # RecursionError, never reaching the parent implementation.
    super()._test_mixed_image_audio_chat_completion()
class TestQwen2AudioServer(TestOpenAIVisionServer):
@classmethod
......
......@@ -213,6 +213,64 @@ class TestOpenAIVisionServer(CustomTestCase):
assert response.usage.completion_tokens > 0
assert response.usage.total_tokens > 0
def _test_mixed_image_audio_chat_completion(self):
    """Check a chat completion whose single user turn mixes image and audio.

    One user message carrying an image URL, an audio URL and a text prompt
    is sent to the server at ``self.base_url``. The reply must come from
    the assistant, be a string, mention at least one of the expected image
    keywords, contain every expected transcript phrase (plain
    case-sensitive substring checks), and report positive token usage.
    """
    client = openai.Client(api_key=self.api_key, base_url=self.base_url)

    # Build the multimodal payload piece by piece: image, audio, then prompt.
    image_part = {
        "type": "image_url",
        "image_url": {"url": IMAGE_MAN_IRONING_URL},
    }
    audio_part = {
        "type": "audio_url",
        "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
    }
    prompt_part = {
        "type": "text",
        "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
    }
    user_message = {
        "role": "user",
        "content": [image_part, audio_part, prompt_part],
    }

    response = client.chat.completions.create(
        model="default",
        messages=[user_message],
        temperature=0,
        **(self.get_vision_request_kwargs()),
    )

    reply = response.choices[0].message
    assert reply.role == "assistant"
    text = reply.content
    assert isinstance(text, str)

    separator = "-" * 30
    print(separator)
    print(f"Mixed image & audio response:\n{text}")
    print(separator)

    # Image description: any single keyword is enough.
    image_keywords = ("man", "cab", "SUV", "taxi", "car")
    assert any(
        keyword in text for keyword in image_keywords
    ), f"text: {text}, should contain man, cab, SUV, taxi or car"

    # Audio transcription: every phrase must be present; fail on the first miss.
    transcript_phrases = (
        "thank you",
        "it's a privilege to be here",
        "leader",
        "science",
        "art",
    )
    for phrase in transcript_phrases:
        assert phrase in text, f"text: |{text}| should contain |{phrase}|"

    # Response metadata and usage accounting must be populated.
    assert response.id
    assert response.created
    usage = response.usage
    assert usage.prompt_tokens > 0
    assert usage.completion_tokens > 0
    assert usage.total_tokens > 0
def prepare_video_images_messages(self, video_path):
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
# the size of the video embeds differs from the `modality` argument when preprocessed
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment