test_hybrid.py 11 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

Mor Zusman's avatar
Mor Zusman committed
4
import pytest
5
import os
Mor Zusman's avatar
Mor Zusman committed
6

7
from tests.utils import multi_gpu_test
8
from vllm.engine.arg_utils import EngineArgs
9
from vllm.sampling_params import SamplingParams
10

11
from ....utils import models_path_prefix
12
13
14
15
16
17
18
from ...utils import check_logprobs_close, check_outputs_equal

# NOTE: The first model in each list is taken as the primary model,
# meaning that it will be used in all tests in this file.
# The rest of the models will only be tested by test_models and
# test_batching.

# Pure state-space models, resolved against the local models directory.
SSM_MODELS = [
    os.path.join(models_path_prefix, repo_id) for repo_id in (
        "state-spaces/mamba-130m-hf",
        "tiiuae/falcon-mamba-tiny-dev",
        # TODO: Compare to a Mamba2 model. The HF transformers implementation
        # of Mamba2 is buggy for Codestral as it doesn't handle n_groups.
        # See https://github.com/huggingface/transformers/pull/35943
        # "mistralai/Mamba-Codestral-7B-v0.1",
    )
]
26

27
# Hybrid models, resolved against the local models directory.
HYBRID_MODELS = [
    os.path.join(models_path_prefix, repo_id) for repo_id in (
        "ai21labs/Jamba-tiny-dev",
        # NOTE: ibm-granite/granite-4.0-tiny-preview is currently skipped as
        # it is not yet available in huggingface transformers.
        # "ibm-granite/granite-4.0-tiny-preview",
        # NOTE: Running Plamo2 with the transformers implementation requires
        # installing the causal-conv1d package, which is not listed as a test
        # dependency as it's not compatible with pip-compile.
        "pfnet/plamo-2-1b",
        "Zyphra/Zamba2-1.2B-instruct",
        "hmellor/tiny-random-BambaForCausalLM",
    )
]
39
40
41

# Value passed to vllm_runner as `max_num_seqs` by several tests in this
# file; kept small to avoid OOM.
MAX_NUM_SEQS = 4
Mor Zusman's avatar
Mor Zusman committed
42
43


44
45
46
@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """
    Compare greedy generation between the HF runner and the vLLM runner.

    Both runners generate from the same prompts with the same token and
    logprob limits; the two result lists are then handed to
    check_logprobs_close.
    """
    # Reference outputs from the HF runner.
    with hf_runner(model) as reference:
        reference_outputs = reference.generate_greedy_logprobs_limit(
            example_prompts,
            max_tokens,
            num_logprobs,
        )

    # Outputs under test from the vLLM runner.
    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as engine:
        engine_outputs = engine.generate_greedy_logprobs(
            example_prompts,
            max_tokens,
            num_logprobs,
        )

    check_logprobs_close(
        outputs_0_lst=reference_outputs,
        outputs_1_lst=engine_outputs,
        name_0="hf",
        name_1="vllm",
    )
Mor Zusman's avatar
Mor Zusman committed
69
70


71
72
73
@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_batching(
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """
    Compare one-prompt-at-a-time generation with a single batched call.

    The same vLLM runner instance produces both result lists, which are
    then handed to check_logprobs_close.
    """
    one_by_one = []
    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as engine:
        for prompt in example_prompts:
            # A single-prompt request must yield exactly one result; the
            # unpacking fails otherwise.
            [only_result] = engine.generate_greedy_logprobs(
                [prompt],
                max_tokens,
                num_logprobs,
            )
            one_by_one.append(only_result)

        all_at_once = engine.generate_greedy_logprobs(
            example_prompts,
            max_tokens,
            num_logprobs,
        )

    check_logprobs_close(
        outputs_0_lst=one_by_one,
        outputs_1_lst=all_at_once,
        name_0="for_loop_vllm",
        name_1="batched_vllm",
    )


100
101
102
103
104
105
106
107
108
109
110
111
112
113
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
def test_chunked_prefill(
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
    num_logprobs: int,
    chunked_prefill_token_size: int,
) -> None:
    """
    Compare generation with chunked prefill enabled against it disabled.

    `chunked_prefill_token_size` is used both as `max_num_seqs` (in both
    runs) and as `max_num_batched_tokens` (in the chunked run only). The
    two result lists are handed to check_logprobs_close.
    """
    with vllm_runner(
            model,
            enable_chunked_prefill=True,
            max_num_batched_tokens=chunked_prefill_token_size,
            max_num_seqs=chunked_prefill_token_size,
    ) as engine:
        with_chunking = engine.generate_greedy_logprobs(
            example_prompts,
            max_tokens,
            num_logprobs,
        )

    with vllm_runner(
            model,
            enable_chunked_prefill=False,
            max_num_seqs=chunked_prefill_token_size,
    ) as engine:
        without_chunking = engine.generate_greedy_logprobs(
            example_prompts,
            max_tokens,
            num_logprobs,
        )

    check_logprobs_close(
        outputs_0_lst=with_chunking,
        outputs_1_lst=without_chunking,
        name_0="chunked",
        name_1="non_chunked",
    )


136
137
138
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [10])
def test_chunked_prefill_with_parallel_sampling(
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
) -> None:
    """
    Tests chunked prefill in conjunction with n > 1.

    In this case, prefill is populated with decoding tokens and
    we only test that generation doesn't fail; outputs are not inspected.

    This test might fail if the cache is not allocated correctly for n > 1
    decoding steps inside a chunked prefill forward pass
    (where we have both prefill and decode together).
    """
    # forces prefill chunks with decoding
    token_budget = MAX_NUM_SEQS * 3
    parallel_sampling = SamplingParams(
        n=3,
        temperature=1,
        seed=0,
        max_tokens=max_tokens,
    )

    with vllm_runner(
            model,
            enable_chunked_prefill=True,
            max_num_batched_tokens=token_budget,
            max_num_seqs=MAX_NUM_SEQS,
    ) as engine:
        engine.generate(example_prompts, parallel_sampling)
166
167


168
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [20])
def test_mamba_cache_cg_padding(
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
) -> None:
    """
    This test is for verifying that mamba cache is padded to CG captured
    batch size. If it's not, a torch RuntimeError will be raised because
    tensor dimensions aren't compatible.

    The prompt batch is grown (by repeating the first prompt) until its
    length differs from `pad_for_cudagraph(len(batch))`, so that the
    generation call runs with a batch size that requires padding.
    """
    vllm_config = EngineArgs(model=model,
                             trust_remote_code=True).create_engine_config()

    # Work on a copy so the fixture-provided list is not mutated in place.
    prompts = list(example_prompts)
    while len(prompts) == vllm_config.pad_for_cudagraph(len(prompts)):
        prompts.append(prompts[0])

    try:
        with vllm_runner(model) as vllm_model:
            vllm_model.generate_greedy(prompts, max_tokens)
    except RuntimeError:
        pytest.fail(
            "Couldn't run batch size which is not equal to a Cuda Graph "
            "captured batch size. "
            "Could be related to mamba cache not padded correctly")


197
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
) -> None:
    """
    Tests that outputs are identical with and w/o preemptions (recompute).

    The same engine generates twice: first with the scheduler's
    ENABLE_ARTIFICIAL_PREEMPT flag switched on, then with it switched off.
    The two result lists are handed to check_outputs_equal.
    """
    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
        scheduler = vllm_model.model.llm_engine.scheduler[0]

        # First pass: generation with artificial preemption enabled.
        scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
        preempt_vllm_outputs = vllm_model.generate_greedy(
            example_prompts, max_tokens)

        # Second pass: same engine with artificial preemption disabled.
        scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=preempt_vllm_outputs,
        outputs_1_lst=vllm_outputs,
        # Label fixed from the misspelled "vllm_preepmtions".
        name_0="vllm_preemptions",
        name_1="vllm",
    )


225
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
    example_prompts,
    model: str,
) -> None:
    """
    This test is for verifying that the hybrid inner state management doesn't
    collapse in the case where the number of incoming requests and
    finished_requests_ids is larger than the maximum mamba block capacity.

    This could generally happen due to the fact that hybrid does support a
    statelessness mechanism where it can clean up new incoming requests in
    a single step.

    The first example prompt is submitted 100 times with 10 tokens each,
    while the runner is limited to MAX_NUM_SEQS sequences. A ValueError
    during generation is reported as a test failure.
    """
    try:
        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        # Adjacent literals are concatenated verbatim, so the separating
        # space must be part of one of them.
        pytest.fail("Hybrid inner state wasn't cleaned up properly between "
                    "steps, finished requests registered unnecessarily")


248
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
def test_state_cleanup(
    vllm_runner,
    example_prompts,
    model: str,
) -> None:
    """
    This test is for verifying that the Hybrid state is cleaned up between
    steps.

    If it's not cleaned, an error would be expected. The first example
    prompt is submitted 100 times per call, for 10 consecutive
    single-token generation calls on the same engine.
    """
    repeated_prompt = [example_prompts[0]] * 100
    failure_message = ("Hybrid inner state wasn't cleaned up between states, "
                       "could be related to finished_requests_ids")

    try:
        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as engine:
            rounds_left = 10
            while rounds_left > 0:
                engine.generate_greedy(repeated_prompt, 1)
                rounds_left -= 1
    except ValueError:
        pytest.fail(failure_message)


269
270
271
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [64])
def test_multistep_correctness(
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
) -> None:
    """
    Compare greedy outputs with num_scheduler_steps=8 against
    num_scheduler_steps=1.

    Each configuration runs in its own engine with max_num_seqs=2; the two
    result lists are handed to check_outputs_equal.
    """

    def _greedy_with_steps(scheduler_steps: int):
        # Start a fresh engine for the given step count and generate once.
        with vllm_runner(model,
                         num_scheduler_steps=scheduler_steps,
                         max_num_seqs=2) as engine:
            return engine.generate_greedy(example_prompts, max_tokens)

    # Run order matches the comparison order: multi-step first.
    multistep_outputs = _greedy_with_steps(8)
    single_step_outputs = _greedy_with_steps(1)

    check_outputs_equal(
        outputs_0_lst=multistep_outputs,
        outputs_1_lst=single_step_outputs,
        name_0="vllm_outputs_multistep",
        name_1="vllm_outputs_single_step",
    )


295
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_distributed_correctness(
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """
    Compare greedy logprobs with tensor_parallel_size=1 against
    tensor_parallel_size=2.

    Each configuration runs in its own engine with max_num_seqs=2; the two
    result lists are handed to check_logprobs_close.
    """

    def _logprobs_with_tp(tp_size: int):
        # Start a fresh engine for the given TP size and generate once.
        with vllm_runner(model, tensor_parallel_size=tp_size,
                         max_num_seqs=2) as engine:
            return engine.generate_greedy_logprobs(example_prompts,
                                                   max_tokens, num_logprobs)

    # Run order matches the comparison order: TP=1 first.
    tp_1_outputs = _logprobs_with_tp(1)
    tp_2_outputs = _logprobs_with_tp(2)

    check_logprobs_close(
        outputs_0_lst=tp_1_outputs,
        outputs_1_lst=tp_2_outputs,
        name_0="vllm_tp_1",
        name_1="vllm_tp_2",
    )