"examples/mxnet/sampling/vscode:/vscode.git/clone" did not exist on "08d4900fd10d8a24af5624ea020b1a52bc89409f"
Unverified Commit 66d6be08 authored by Binyao Jiang's avatar Binyao Jiang Committed by GitHub
Browse files

Bug fix: use correct mm_items in embed_mm_inputs (#8893)

parent 1c1f8a11
@@ -560,7 +560,7 @@ def embed_mm_inputs(
            ]
            items_size[i + 1] = len(mm_items)
            items_offsets.append(
-               flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
+               flatten_nested_list([item.offsets for item in mm_items])
            )
        items_size = torch.cumsum(items_size, dim=0).tolist()
...
@@ -189,6 +189,9 @@ class TestGemma3nServer(TestOpenAIVisionServer):
        # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
        # self._test_audio_ambient_completion()
def test_mixed_image_audio_chat_completion(self):
    """Exercise the shared mixed image + audio chat-completion check.

    Fix: the scraped addition defined ``_test_mixed_image_audio_chat_completion``
    calling itself — infinite recursion, and the leading underscore kept
    unittest from discovering it. The wrapper must be a public ``test_*``
    method delegating to the base-class ``_test_*`` helper, matching the
    pattern of the surrounding tests (e.g. the commented-out
    ``_test_audio_ambient_completion`` wrapper above).
    """
    self._test_mixed_image_audio_chat_completion()
class TestQwen2AudioServer(TestOpenAIVisionServer):
    @classmethod
......
@@ -213,6 +213,64 @@ class TestOpenAIVisionServer(CustomTestCase):
            assert response.usage.completion_tokens > 0
            assert response.usage.total_tokens > 0
def _test_mixed_image_audio_chat_completion(self):
    """Send one chat request carrying an image URL, an audio URL, and a
    text instruction, then verify the reply covers both modalities.

    The image check passes if any expected scene keyword appears; the
    audio check requires every expected transcription phrase to appear.
    """
    oai = openai.Client(api_key=self.api_key, base_url=self.base_url)

    # One user turn with all three content parts.
    content = [
        {
            "type": "image_url",
            "image_url": {"url": IMAGE_MAN_IRONING_URL},
        },
        {
            "type": "audio_url",
            "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
        },
        {
            "type": "text",
            "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
        },
    ]
    response = oai.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": content}],
        temperature=0,
        **(self.get_vision_request_kwargs()),
    )

    message = response.choices[0].message
    assert message.role == "assistant"
    text = message.content
    assert isinstance(text, str)

    print("-" * 30)
    print(f"Mixed image & audio response:\n{text}")
    print("-" * 30)

    # Image portion: at least one scene keyword must be mentioned.
    assert any(
        keyword in text for keyword in ("man", "cab", "SUV", "taxi", "car")
    ), f"text: {text}, should contain man, cab, SUV, taxi or car"

    # Audio portion: each transcription phrase must be present.
    for check_word in (
        "thank you",
        "it's a privilege to be here",
        "leader",
        "science",
        "art",
    ):
        assert (
            check_word in text
        ), f"text: |{text}| should contain |{check_word}|"

    assert response.id
    assert response.created
    assert response.usage.prompt_tokens > 0
    assert response.usage.completion_tokens > 0
    assert response.usage.total_tokens > 0
    def prepare_video_images_messages(self, video_path):
        # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
        # the size of the video embeds differs from the `modality` argument when preprocessed
...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment