"vllm/model_executor/models/minicpm_eagle.py" did not exist on "e489ad7a210f4234db696d1f2749d5f3662fa65b"
test_completions.py 15.8 KB
Newer Older
1
2
3
4
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import io
5
from collections.abc import Sequence
6
7
8
9
10
11
12
from dataclasses import dataclass
from typing import Any

import pybase64
import pytest
import torch

13
14
from vllm.config import ModelConfig
from vllm.inputs import SingletonPrompt
15
16
from vllm.renderers import TokenizeParams
from vllm.renderers.hf import HfRenderer
17
from vllm.renderers.inputs.preprocess import parse_model_prompt, prompt_to_seq
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from vllm.tokenizers.registry import tokenizer_args_from_config

# Model identifier used as the default for both `model` and `tokenizer`
# in MockModelConfig.
MODEL_NAME = "openai-community/gpt2"


@dataclass
class MockHFConfig:
    """Placeholder object used as ``MockModelConfig.hf_config``."""

    # The only attribute this mock provides; defaults to a generic value.
    model_type: str = "any"


@dataclass
class MockModelConfig:
    """Lightweight model configuration used to build the renderer under test.

    Only the annotated attributes are dataclass fields and can therefore be
    overridden through the constructor (for example
    ``MockModelConfig(skip_tokenizer_init=True)``).  The unannotated ones
    (``runner_type``, ``tokenizer_revision``, ``tokenizer_mode`` and
    ``hf_config``) are plain class attributes shared by every instance; they
    are intentionally left unannotated so that ``@dataclass`` does not turn
    them into fields.
    """

    # Class attribute (not a dataclass field).
    runner_type = "generate"
    model: str = MODEL_NAME
    tokenizer: str = MODEL_NAME
    trust_remote_code: bool = False
    # Class attributes (not dataclass fields).
    tokenizer_revision = None
    tokenizer_mode = "auto"
    # Shared instance; kept unannotated because a dataclass field may not
    # use an unhashable (mutable) object as its default value.
    hf_config = MockHFConfig()
    encoder_config: dict[str, Any] | None = None
    enable_prompt_embeds: bool = True
    # When True, `_build_renderer` passes `tokenizer=None` to the renderer.
    skip_tokenizer_init: bool = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False
42
43


44
45
46
47
48
@dataclass
class MockParallelConfig:
    """Placeholder parallel configuration stored on ``MockVllmConfig``."""

    # The only attribute this mock provides.
    _api_process_rank: int = 0


49
50
51
@dataclass
class MockVllmConfig:
    """Top-level config bundle handed to ``HfRenderer`` by ``_build_renderer``.

    Groups the mock model configuration with the mock parallel
    configuration; both fields are required.
    """

    model_config: MockModelConfig
    parallel_config: MockParallelConfig
53
54


55
56
57
58
59
60
61
62
63
64
65
66
67
@dataclass
class DummyTokenizer:
    """Deterministic tokenizer double that emits one token per character.

    ``encode`` records the keyword arguments of its most recent call in
    ``_captured_encode_kwargs`` so tests can check how (and whether) the
    tokenizer was invoked.

    ``truncation_side`` and ``max_chars_per_token`` are not read by this
    class itself; presumably the renderer consults them - TODO confirm.
    """

    truncation_side: str = "left"
    max_chars_per_token: int = 1

    def __post_init__(self) -> None:
        # Starts empty; stays empty for as long as `encode` is never called.
        self._captured_encode_kwargs: dict = {}

    def decode(self, tokens: list[int]) -> str:
        """Return the ``str()`` form of ``tokens`` (e.g. ``"[1, 2, 3]"``)."""
        return str(tokens)

    def encode(self, text: str, **kwargs) -> list[int]:
        """Return ``[0, 1, ..., k - 1]`` where ``k`` is the token count.

        ``k`` equals ``len(text)`` unless a truthy ``truncation`` and a
        non-``None`` ``max_length`` are both passed, in which case ``k`` is
        capped at ``max_length``.  The received keyword arguments replace
        ``_captured_encode_kwargs``.
        """
        self._captured_encode_kwargs = kwargs

        in_length = len(text)
        truncation = kwargs.get("truncation")
        max_length = kwargs.get("max_length")
        # `max_length=0` is a valid cap, hence the explicit `is not None`.
        if truncation and max_length is not None:
            return list(range(min(in_length, max_length)))

        return list(range(in_length))
76
77


78
79
80
81
82
83
84
def _build_renderer(
    model_config: MockModelConfig,
    *,
    truncation_side: str = "left",
    max_chars_per_token: int = 1,
):
    """Create an ``HfRenderer`` wired to the mock configs and tokenizer.

    Args:
        model_config: Mock model configuration.  When its
            ``skip_tokenizer_init`` is true the renderer receives
            ``tokenizer=None``.
        truncation_side: Forwarded to ``DummyTokenizer``.
        max_chars_per_token: Forwarded to ``DummyTokenizer``.

    Returns:
        The constructed ``HfRenderer``.
    """
    # NOTE(review): none of the returned values are used below; the call is
    # retained unchanged in case it validates `model_config` - confirm
    # whether it can be dropped.
    _, _tokenizer_name, _, _kwargs = tokenizer_args_from_config(model_config)

    if model_config.skip_tokenizer_init:
        tokenizer = None
    else:
        tokenizer = DummyTokenizer(
            truncation_side=truncation_side,
            max_chars_per_token=max_chars_per_token,
        )

    return HfRenderer(
        MockVllmConfig(model_config, parallel_config=MockParallelConfig()),
        tokenizer=tokenizer,
    )

100

101
def _preprocess_prompt(
    model_config: ModelConfig,
    prompt_or_prompts: SingletonPrompt | bytes | Sequence[SingletonPrompt | bytes],
):
    """Turn raw test input into the list passed to ``render_prompts``.

    The input is first normalized with ``prompt_to_seq``.  Items that are
    ``bytes`` (the tests use these for serialized prompt embeddings) are
    passed through untouched; every other item is converted with
    ``parse_model_prompt``.

    Args:
        model_config: Configuration forwarded to ``parse_model_prompt``.
        prompt_or_prompts: A single prompt or a sequence of prompts.

    Returns:
        A list with one entry per prompt yielded by ``prompt_to_seq``.
    """
    return [
        (
            prompt
            if isinstance(prompt, bytes)
            else parse_model_prompt(model_config, prompt)
        )
        for prompt in prompt_to_seq(prompt_or_prompts)
    ]


115
class TestValidatePrompt:
    """Input-validation errors raised while preparing/rendering prompts."""

    def test_empty_input(self):
        """An empty prompt list is rejected with a ``ValueError``."""
        renderer = _build_renderer(MockModelConfig())

        # Preprocessing stays inside the context manager: the error may be
        # raised by either step.
        with pytest.raises(ValueError, match="at least one prompt"):
            renderer.render_prompts(_preprocess_prompt(renderer.model_config, []))

    def test_invalid_type(self):
        """Mixing a token list with a list of strings raises ``TypeError``."""
        renderer = _build_renderer(MockModelConfig())

        with pytest.raises(TypeError, match="should be a list of integers"):
            renderer.render_prompts(
                _preprocess_prompt(renderer.model_config, [[1, 2], ["foo", "bar"]])  # type: ignore[arg-type]
            )
129
130
131


class TestRenderPrompt:
    """Rendering and tokenizing of token-id and text prompts."""

    def test_token_input(self):
        """A flat list of token ids is treated as a single prompt."""
        renderer = _build_renderer(MockModelConfig())

        tokens = [101, 7592, 2088]
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, tokens)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100),
        )

        assert len(results) == 1
        assert results[0]["prompt_token_ids"] == tokens

    def test_token_list_input(self):
        """A list of token-id lists yields one result per inner list."""
        renderer = _build_renderer(MockModelConfig())

        token_lists = [[101, 7592, 2088], [102, 1234, 5678, 9012], [103, 4567]]
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, token_lists)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100),
        )

        assert len(results) == 3
        assert results[0]["prompt_token_ids"] == [101, 7592, 2088]
        assert results[1]["prompt_token_ids"] == [102, 1234, 5678, 9012]
        assert results[2]["prompt_token_ids"] == [103, 4567]

    def test_text_input(self):
        """A single string is tokenized (one dummy token per character)."""
        renderer = _build_renderer(MockModelConfig())

        text_input = "x" * 10
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, text_input)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100),
        )

        assert len(results) == 1
        assert len(results[0]["prompt_token_ids"]) == 10

    def test_text_list_input(self):
        """A list of strings yields one tokenized result per string."""
        renderer = _build_renderer(MockModelConfig())

        text_list_input = ["x" * 10, "x" * 12, "x" * 14]
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, text_list_input)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100),
        )

        assert len(results) == 3
        for text_input, result in zip(text_list_input, results):
            assert len(result["prompt_token_ids"]) == len(text_input)

    def test_zero_truncation(self):
        """``truncate_prompt_tokens=0`` leaves no prompt tokens."""
        renderer = _build_renderer(MockModelConfig())

        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, "x" * 200)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100, truncate_prompt_tokens=0),
        )

        assert len(results) == 1
        assert len(results[0]["prompt_token_ids"]) == 0

    def test_pos_truncation(self):
        """A positive ``truncate_prompt_tokens`` caps the prompt length."""
        renderer = _build_renderer(MockModelConfig())

        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, "x" * 200)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100, truncate_prompt_tokens=50),
        )

        assert len(results) == 1
        assert len(results[0]["prompt_token_ids"]) == 50

    def test_neg_truncation(self):
        """``truncate_prompt_tokens=-1`` truncates to ``max_total_tokens``."""
        renderer = _build_renderer(MockModelConfig())

        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, "x" * 200)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100, truncate_prompt_tokens=-1),
        )

        assert len(results) == 1
        assert len(results[0]["prompt_token_ids"]) == 100  # max_total_tokens

    def test_truncation_left(self):
        """With ``truncation_side="left"`` the trailing tokens are kept."""
        renderer = _build_renderer(MockModelConfig(), truncation_side="left")

        long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]  # 10 tokens
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, long_tokens)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100, truncate_prompt_tokens=5),
        )

        assert len(results) == 1
        # Should keep the last 5 tokens: [105, 106, 107, 108, 109]
        assert results[0]["prompt_token_ids"] == [105, 106, 107, 108, 109]

    def test_truncation_right(self):
        """With ``truncation_side="right"`` the leading tokens are kept."""
        renderer = _build_renderer(MockModelConfig(), truncation_side="right")

        long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]  # 10 tokens
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, long_tokens)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100, truncate_prompt_tokens=5),
        )

        assert len(results) == 1
        # Should keep the first 5 tokens: [100, 101, 102, 103, 104]
        assert results[0]["prompt_token_ids"] == [100, 101, 102, 103, 104]

    def test_text_max_length_exceeded_obvious(self):
        """Text that is obviously too long is rejected without tokenizing."""
        renderer = _build_renderer(MockModelConfig(), max_chars_per_token=1)

        # Exceeds max_total_tokens and max_total_tokens * VLLM_MAX_CHARS_PER_TOKEN
        long_text = "x" * 150
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, long_text)
        )

        with pytest.raises(
            ValueError,
            match="maximum context length is",
        ):
            renderer.tokenize_prompts(
                prompts,
                TokenizeParams(max_total_tokens=100),
            )

        # Should not even attempt tokenization
        assert renderer.tokenizer._captured_encode_kwargs == {}

    def test_text_max_length_exceeded_nonobvious(self):
        """Text within the character budget is tokenized, then rejected."""
        renderer = _build_renderer(MockModelConfig(), max_chars_per_token=2)

        # Exceeds max_total_tokens but not max_total_tokens * VLLM_MAX_CHARS_PER_TOKEN
        long_text = "x" * 150
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, long_text)
        )

        with pytest.raises(
            ValueError,
            match="maximum context length is",
        ):
            renderer.tokenize_prompts(
                prompts,
                TokenizeParams(max_total_tokens=100),
            )

        # Should only tokenize the first max_total_tokens + 1 tokens
        assert renderer.tokenizer._captured_encode_kwargs["truncation"] is True
        assert renderer.tokenizer._captured_encode_kwargs["max_length"] == 101

    def test_token_max_length_exceeded(self):
        """Too many token ids without truncation raises ``ValueError``."""
        renderer = _build_renderer(MockModelConfig())

        long_tokens = list(range(150))  # Exceeds max_total_tokens=100
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, long_tokens)
        )

        with pytest.raises(
            ValueError,
            match="maximum context length is",
        ):
            renderer.tokenize_prompts(
                prompts,
                TokenizeParams(max_total_tokens=100, truncate_prompt_tokens=None),
            )

    def test_no_tokenizer_for_text(self):
        """Text prompts cannot be tokenized when the tokenizer is skipped."""
        renderer = _build_renderer(MockModelConfig(skip_tokenizer_init=True))

        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, "Hello world")
        )

        with pytest.raises(ValueError, match="`skip_tokenizer_init=True`"):
            renderer.tokenize_prompts(
                prompts,
                TokenizeParams(max_total_tokens=100),
            )

    def test_token_input_with_needs_detokenization(self):
        """``needs_detokenization=True`` adds the decoded ``prompt`` text."""
        renderer = _build_renderer(MockModelConfig())

        tokens = [1, 2, 3, 4]
        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, tokens)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(
                max_total_tokens=100,
                needs_detokenization=True,
            ),
        )

        assert len(results) == 1
        assert results[0]["prompt_token_ids"] == tokens
        # DummyTokenizer.decode returns str(tokens).
        assert results[0]["prompt"] == "[1, 2, 3, 4]"
360
361
362
363
364
365
366
367
368
369


class TestRenderEmbedPrompt:
    """Rendering of prompts supplied as serialized embedding tensors."""

    def _create_test_embed_bytes(self, tensor: torch.Tensor) -> bytes:
        """Serialize ``tensor`` with ``torch.save`` and base64-encode it."""
        buffer = io.BytesIO()
        torch.save(tensor, buffer)
        # getvalue() returns the whole buffer regardless of stream position.
        return pybase64.b64encode(buffer.getvalue())

    def test_single_prompt_embed(self):
        """One encoded tensor round-trips unchanged as ``prompt_embeds``."""
        renderer = _build_renderer(MockModelConfig())

        # Create a test tensor
        tensor_input = torch.randn(10, 768, dtype=torch.float32)
        embed_bytes = self._create_test_embed_bytes(tensor_input)

        prompts = renderer.render_prompts(
            _preprocess_prompt(renderer.model_config, embed_bytes)
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100),
        )

        assert len(results) == 1
        assert torch.equal(results[0]["prompt_embeds"], tensor_input)

    def test_multiple_prompt_embeds(self):
        """Several encoded tensors yield one result each, in input order."""
        renderer = _build_renderer(MockModelConfig())

        # Create multiple test tensors
        tensor_inputs = [
            torch.randn(8, 512, dtype=torch.float32),
            torch.randn(12, 512, dtype=torch.float32),
        ]

        prompts = renderer.render_prompts(
            _preprocess_prompt(
                renderer.model_config,
                [self._create_test_embed_bytes(t) for t in tensor_inputs],
            )
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100),
        )

        assert len(results) == 2
        for result, tensor_input in zip(results, tensor_inputs):
            assert torch.allclose(result["prompt_embeds"], tensor_input)

    def test_prompt_embed_truncation(self):
        """``truncate_prompt_tokens`` keeps the trailing embedding rows."""
        renderer = _build_renderer(MockModelConfig())

        # Create tensor with more tokens than truncation limit
        tensor_input = torch.randn(20, 768, dtype=torch.float32)

        prompts = renderer.render_prompts(
            _preprocess_prompt(
                renderer.model_config, self._create_test_embed_bytes(tensor_input)
            )
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(
                max_total_tokens=100,
                truncate_prompt_tokens=10,
            ),
        )

        assert len(results) == 1
        # Should keep last 10 tokens
        expected = tensor_input[-10:]
        assert torch.equal(results[0]["prompt_embeds"], expected)

    def test_prompt_embed_different_dtypes(self):
        """The embedding dtype is preserved for each supported dtype."""
        renderer = _build_renderer(MockModelConfig())

        # Test different supported dtypes
        dtypes = [torch.float32, torch.float16, torch.bfloat16]

        for dtype in dtypes:
            tensor_input = torch.randn(5, 256, dtype=dtype)

            prompts = renderer.render_prompts(
                _preprocess_prompt(
                    renderer.model_config, self._create_test_embed_bytes(tensor_input)
                )
            )
            results = renderer.tokenize_prompts(
                prompts,
                TokenizeParams(max_total_tokens=100),
            )

            assert len(results) == 1
            assert results[0]["prompt_embeds"].dtype == dtype

    def test_prompt_embed_squeeze_batch_dim(self):
        """A leading batch dimension of size 1 is removed."""
        renderer = _build_renderer(MockModelConfig())

        # Test tensor with batch dimension gets squeezed
        tensor_input = torch.randn(1, 10, 768, dtype=torch.float32)

        prompts = renderer.render_prompts(
            _preprocess_prompt(
                renderer.model_config, self._create_test_embed_bytes(tensor_input)
            )
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100),
        )

        assert len(results) == 1
        # Should be squeezed to 2D
        assert results[0]["prompt_embeds"].shape == (10, 768)

    def test_both_prompts_and_embeds(self):
        """Text and embedding prompts can be mixed in a single request."""
        renderer = _build_renderer(MockModelConfig())

        text_input = "Hello world"
        tensor_input = torch.randn(5, 256, dtype=torch.float32)

        prompts = renderer.render_prompts(
            _preprocess_prompt(
                renderer.model_config,
                [text_input, self._create_test_embed_bytes(tensor_input)],
            )
        )
        results = renderer.tokenize_prompts(
            prompts,
            TokenizeParams(max_total_tokens=100),
        )

        assert len(results) == 2
        # First should be tokens prompt
        assert "prompt_token_ids" in results[0]
        assert len(results[0]["prompt_token_ids"]) == len(text_input)
        # Second should be embed prompt
        assert torch.equal(results[1]["prompt_embeds"], tensor_input)