# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
5
import torch
6

7
8
9
from vllm.model_executor.models.gemma3n_audio_utils import (
    adjust_audio_features_to_expected_length,
)
10
11
12
13
14
from vllm.multimodal import MULTIMODAL_REGISTRY

from ....conftest import ImageTestAssets
from ...utils import build_model_context

15
16
# Gemma3 (image) model identifier, used to parametrize the image-size test.
GEMMA3_MODEL_ID = "google/gemma-3-4b-it"

# Gemma3n (multimodal with audio) model identifier.
GEMMA3N_MODEL_ID = "google/gemma-3n-E2B-it"

# Expected audio tokens for Gemma3n (audio_soft_tokens_per_image).
# The audio tests pad or truncate every feature tensor to this length.
GEMMA3N_EXPECTED_AUDIO_TOKENS = 188


class TestGemma3nAudioTensorLogic:
    """CPU-only checks of the Gemma3n audio feature length adjustment.

    Most tests call adjust_audio_features_to_expected_length() with random
    features and an all-zero padding embedding, then check that the sequence
    axis of the result has exactly GEMMA3N_EXPECTED_AUDIO_TOKENS entries.
    That helper is the fix for the integer overflow raised in
    _process_audio_input when audio_seq_len > 188.
    """

    def test_padding_when_audio_short(self):
        """Audio shorter than the target length gets padded up to it."""
        target_len = GEMMA3N_EXPECTED_AUDIO_TOKENS
        n_batch, n_frames, n_dim = 1, 100, 256

        features = torch.randn(n_batch, n_frames, n_dim)
        pad_emb = torch.zeros(1, 1, n_dim)

        adjusted, n_dropped = adjust_audio_features_to_expected_length(
            features, target_len, pad_emb
        )

        assert adjusted.shape == (n_batch, target_len, n_dim)
        assert n_dropped == 0

        # The leading 100 positions carry the input unchanged ...
        head = adjusted[:, :n_frames, :]
        assert torch.allclose(head, features)

        # ... and everything after them is the (zero) padding embedding.
        tail = adjusted[:, n_frames:, :]
        expected_tail = torch.zeros(n_batch, target_len - n_frames, n_dim)
        assert torch.allclose(tail, expected_tail)

    def test_truncation_when_audio_long(self):
        """Audio longer than the target length gets cut down to it.

        This is the central regression test for the overflow fix: with
        audio_seq_len > expected_tokens the old code derived a negative
        padding amount, which ended in
        RuntimeError: numel: integer multiplication overflow
        """
        target_len = GEMMA3N_EXPECTED_AUDIO_TOKENS
        n_batch, n_frames, n_dim = 1, 192, 256  # 192 exceeds 188

        features = torch.randn(n_batch, n_frames, n_dim)
        pad_emb = torch.zeros(1, 1, n_dim)

        adjusted, n_dropped = adjust_audio_features_to_expected_length(
            features, target_len, pad_emb
        )

        assert adjusted.shape == (n_batch, target_len, n_dim)
        # 192 - 188 = 4 positions are reported as dropped.
        assert n_dropped == n_frames - target_len
        # What remains is the leading 188 positions of the input.
        kept = features[:, :target_len, :]
        assert torch.allclose(adjusted, kept)

    def test_no_change_when_exact_length(self):
        """Audio already at the target length comes back unmodified."""
        target_len = GEMMA3N_EXPECTED_AUDIO_TOKENS
        n_batch, n_dim = 1, 256

        features = torch.randn(n_batch, target_len, n_dim)
        pad_emb = torch.zeros(1, 1, n_dim)

        adjusted, n_dropped = adjust_audio_features_to_expected_length(
            features, target_len, pad_emb
        )

        assert adjusted.shape == features.shape
        assert n_dropped == 0
        assert torch.allclose(adjusted, features)

    def test_original_bug_would_fail(self):
        """Show that the pre-fix, pad-only logic raises for long audio.

        The old code unconditionally padded; once audio_seq_len exceeds
        expected_tokens the size handed to expand() is negative.
        """
        n_batch, n_frames, n_dim = 1, 192, 256
        pad_emb = torch.zeros(1, 1, n_dim)

        # Pre-fix computation (pad only, no truncation): 188 - 192 = -4.
        negative_pad = GEMMA3N_EXPECTED_AUDIO_TOKENS - n_frames

        with pytest.raises(RuntimeError):
            # A negative target size is expected to be rejected here.
            pad_emb.expand(n_batch, negative_pad, n_dim)

    @pytest.mark.parametrize(
        "seq_len", [50, 100, 150, 187, 188, 189, 192, 200, 300]
    )
    def test_various_audio_lengths(self, seq_len: int):
        """Sweep lengths below, at, and above the target length."""
        target_len = GEMMA3N_EXPECTED_AUDIO_TOKENS
        n_batch, n_dim = 1, 256

        features = torch.randn(n_batch, seq_len, n_dim)
        pad_emb = torch.zeros(1, 1, n_dim)

        # The call itself must succeed for every length in the sweep.
        adjusted, n_dropped = adjust_audio_features_to_expected_length(
            features, target_len, pad_emb
        )

        # Regardless of input length the output is target_len long.
        assert adjusted.shape == (n_batch, target_len, n_dim)

        # Only inputs longer than the target report dropped positions.
        assert n_dropped == max(seq_len - target_len, 0)

    def test_batch_processing(self):
        """Truncation applies across a batch of more than one item."""
        target_len = GEMMA3N_EXPECTED_AUDIO_TOKENS
        n_batch, n_frames, n_dim = 4, 192, 256

        features = torch.randn(n_batch, n_frames, n_dim)
        pad_emb = torch.zeros(1, 1, n_dim)

        adjusted, n_dropped = adjust_audio_features_to_expected_length(
            features, target_len, pad_emb
        )

        assert adjusted.shape == (n_batch, target_len, n_dim)
        assert n_dropped == n_frames - target_len


@pytest.mark.parametrize("model_id", [GEMMA3_MODEL_ID])
def test_get_image_size_with_most_features(
    image_assets: ImageTestAssets, model_id: str
):
    """Check that no test image produces more tokens than the reported maximum.

    A processor is built for ``model_id`` with pan-and-scan enabled and a
    limit of one image per prompt. The token count for the size returned by
    ``get_image_size_with_most_features()`` is taken as the upper bound, and
    every image in ``image_assets`` is then processed and compared to it.

    Args:
        image_assets: Fixture providing the images to run through the
            processor; each item must expose a ``pil_image`` attribute.
        model_id: Model identifier passed to ``build_model_context``.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    # No per-call overrides: the HF processor is created with its defaults.
    hf_processor_mm_kwargs: dict[str, object] = {}
    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)

    # Upper bound: token count for the image size the processor info
    # reports as having the most features.
    max_image_size = processor.info.get_image_size_with_most_features()
    max_tokens = processor.info.get_num_image_tokens(
        image_width=max_image_size.width,
        image_height=max_image_size.height,
        processor=hf_processor,
        mm_kwargs=hf_processor_mm_kwargs,
    )

    prompt = "<start_of_image>"
    image_seq_length = hf_processor.image_seq_length

    for asset in image_assets:
        mm_data = {"image": [asset.pil_image]}
        processed_inputs = processor.apply(
            prompt,
            mm_items=processor.info.parse_mm_data(mm_data),
            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
        )
        mm_kwargs_data = processed_inputs["mm_kwargs"].get_data()
        num_patches_tensor = mm_kwargs_data["num_patches"]
        # Tokens for this image: the "num_patches" value scaled by the
        # processor's image_seq_length.
        tokens = int(num_patches_tensor.item()) * image_seq_length
        assert tokens <= max_tokens