test_tensorizer.py 16.5 KB
Newer Older
1
import gc
import json
import os
import pathlib
import subprocess
from functools import partial
from typing import List, Optional, Tuple
from unittest.mock import MagicMock, patch

import openai
import pytest
import torch
from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.lora.request import LoRARequest
# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                         TensorSerializer,
                                                         is_vllm_tensorized,
                                                         load_with_tensorizer,
                                                         open_stream,
                                                         serialize_vllm_model,
                                                         tensorize_vllm_model)
# yapf: enable
from vllm.utils import PlaceholderModule, import_from_path

from ..utils import VLLM_PATH, RemoteOpenAIServer
from ..utils import RemoteOpenAIServer, models_path_prefix
from .conftest import retry_until_skip
31

32
33
34
35
36
37
# tensorizer is treated as an optional dependency here: if the import fails,
# stand-ins built from PlaceholderModule are bound to the same names so the
# rest of this module can still be imported.
# NOTE(review): presumably the placeholder raises only when it is actually
# used -- confirm against vllm.utils.PlaceholderModule.
try:
    from tensorizer import EncryptionParams
except ImportError:
    tensorizer = PlaceholderModule("tensorizer")  # type: ignore[assignment]
    EncryptionParams = tensorizer.placeholder_attr("EncryptionParams")

38
# Directory holding the example scripts; the LoRA test below imports
# "offline_inference/multilora_inference.py" from it.
EXAMPLES_PATH = VLLM_PATH / "examples"
39

40
41
42
43
44
45
46
47
48
# Prompts shared by every generation test in this module.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Sampling settings shared by the generation tests. Several tests compare the
# outputs of two separate runs for equality, so the seed is pinned.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)

49
# Default model used by the tests; resolved under models_path_prefix rather
# than given as a bare hub identifier.
model_ref = os.path.join(models_path_prefix, "facebook/opt-125m")
50
51
# Path to the helper script that sits next to this test file.
# NOTE(review): nothing in the visible part of this module references this
# name -- confirm whether it is still needed.
tensorize_model_for_testing_script = os.path.join(
    os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py")
52

53

54
55
56
57
58
59
60
def is_curl_installed():
    """Return True when ``curl --version`` can be executed successfully.

    This helper is evaluated at import time from ``skipif`` decorators, so it
    must never raise and should stay quiet: curl's version banner is sent to
    DEVNULL instead of leaking into the test output.

    Returns:
        bool: False when curl is missing, cannot be executed (any OSError,
        e.g. FileNotFoundError or PermissionError) or exits with a non-zero
        status; True otherwise.
    """
    try:
        subprocess.check_call(['curl', '--version'],
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError):
        return False
    return True

61

62
63
64
65
66
def write_keyfile(keyfile_path: str):
    """Generate a random encryption key and store it at ``keyfile_path``.

    Missing parent directories are created first. The raw key bytes of a
    freshly generated ``EncryptionParams`` are written in binary mode,
    replacing any existing file.
    """
    params = EncryptionParams.random()
    destination = pathlib.Path(keyfile_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(params.key)
67
68


69
@patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent')
def test_load_with_tensorizer(mock_agent, tensorizer_config):
    """load_with_tensorizer builds one TensorizerAgent and returns its result.

    TensorizerAgent is patched out, so the test only checks the wiring: the
    agent is constructed once with the config and quant method, deserialize()
    is called once, and its return value is handed back to the caller.
    """
    mock_linear_method = MagicMock()
    mock_agent_instance = mock_agent.return_value
    mock_agent_instance.deserialize.return_value = MagicMock()

    result = load_with_tensorizer(tensorizer_config,
                                  quant_method=mock_linear_method)

    mock_agent.assert_called_once_with(tensorizer_config,
                                       quant_method=mock_linear_method)
    mock_agent_instance.deserialize.assert_called_once()
    assert result == mock_agent_instance.deserialize.return_value


@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_can_deserialize_s3(vllm_runner):
    """A pre-tensorized pythia-1.4b can be loaded and produces output."""
    model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
    # NOTE(review): the tensor file is addressed relative to the local model
    # directory while an S3 endpoint is still configured -- confirm this is
    # the intended source for a test named "..._s3".
    tensorized_path = f"{model_ref}/fp16/model.tensors"

    with vllm_runner(model_ref,
                     load_format="tensorizer",
                     model_loader_extra_config=TensorizerConfig(
                         tensorizer_uri=tensorized_path,
                         num_readers=1,
                         s3_endpoint="object.ord1.coreweave.com",
                     )) as loaded_hf_model:
        deserialized_outputs = loaded_hf_model.generate(
            prompts, sampling_params)

        # Only checks that generation returned something non-empty.
        assert deserialized_outputs
101
102
103
104
105


@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_deserialized_encrypted_vllm_model_has_same_outputs(
        vllm_runner, tmp_path):
    """Encrypt-serialize a vLLM model, reload it, and compare generations."""
    # model_ref is built from models_path_prefix and may be absolute. Joining
    # an absolute path onto tmp_path discards tmp_path, which would write the
    # artifacts next to the model instead of into the per-test directory, so
    # only the final path component is used for the file names.
    artifact_stem = os.path.basename(model_ref)
    model_path = tmp_path / (artifact_stem + ".tensors")
    key_path = tmp_path / (artifact_stem + ".key")

    with vllm_runner(model_ref) as vllm_model:
        write_keyfile(key_path)

        # Reference outputs from the model before it is serialized.
        outputs = vllm_model.generate(prompts, sampling_params)

        config_for_serializing = TensorizerConfig(tensorizer_uri=model_path,
                                                  encryption_keyfile=key_path)

        vllm_model.apply_model(
            partial(serialize_vllm_model,
                    tensorizer_config=config_for_serializing))

    config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                encryption_keyfile=key_path)

    with vllm_runner(model_ref,
                     load_format="tensorizer",
                     model_loader_extra_config=config_for_deserializing
                     ) as loaded_vllm_model:
        deserialized_outputs = loaded_vllm_model.generate(
            prompts, sampling_params)

        assert outputs == deserialized_outputs
133
134
135
136


def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
                                                tmp_path):
    """Serialize the HF module with TensorSerializer and load it in vLLM.

    Greedy generations of the original HF model and of the vLLM model loaded
    from the tensorized file must be identical.
    """
    # model_ref may be absolute; joining an absolute path onto tmp_path would
    # discard tmp_path, so only the final path component names the artifact.
    model_path = tmp_path / (os.path.basename(model_ref) + ".tensors")
    max_tokens = 50

    with hf_runner(model_ref) as hf_model:
        outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
        with open_stream(model_path, "wb+") as stream:
            serializer = TensorSerializer(stream)
            serializer.write_module(hf_model.model)

    with vllm_runner(model_ref,
                     load_format="tensorizer",
                     model_loader_extra_config=TensorizerConfig(
                         tensorizer_uri=model_path,
                         num_readers=1,
                     )) as loaded_hf_model:
        deserialized_outputs = loaded_hf_model.generate_greedy(
            prompts, max_tokens=max_tokens)

        assert outputs == deserialized_outputs
155
156


zhuwenwen's avatar
zhuwenwen committed
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def create_test_prompts(
        lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
    """Create a list of test prompts with their sampling parameters.

    2 requests for base model, 4 requests for the LoRA. We define 2
    different LoRA adapters (using the same model for demo purposes).
    Since we also set `max_loras=1`, the expectation is that the requests
    with the second LoRA adapter will be run after all requests with the
    first adapter have finished.

    Args:
        lora_path: Location of the LoRA adapter weights; every LoRARequest
            in the result points at this same path.

    Returns:
        ``(prompt, sampling_params, lora_request)`` tuples in submission
        order; ``lora_request`` is None for the base-model requests.
    """
    icao_prompt = "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]"  # noqa: E501
    elector_prompt = "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]"  # noqa: E501

    def greedy_sql_params() -> SamplingParams:
        # A fresh object per request, so requests never share parameters.
        return SamplingParams(temperature=0.0,
                              logprobs=1,
                              prompt_logprobs=1,
                              max_tokens=128,
                              stop_token_ids=[32003])

    def beam_sql_params() -> SamplingParams:
        # NOTE(review): `use_beam_search` / `best_of` may not be accepted by
        # every vLLM release -- confirm against the installed version.
        return SamplingParams(n=3,
                              best_of=3,
                              use_beam_search=True,
                              temperature=0,
                              max_tokens=128,
                              stop_token_ids=[32003])

    return [
        # Base-model requests (no adapter).
        ("A robot may not injure a human being",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128), None),
        ("To be or not to be,",
         SamplingParams(temperature=0.8,
                        top_k=5,
                        presence_penalty=0.2,
                        max_tokens=128), None),
        # LoRA requests.
        (icao_prompt, greedy_sql_params(),
         LoRARequest("sql-lora", 1, lora_path)),
        (elector_prompt, beam_sql_params(),
         LoRARequest("sql-lora", 1, lora_path)),
        (icao_prompt, greedy_sql_params(),
         LoRARequest("sql-lora2", 2, lora_path)),
        (elector_prompt, beam_sql_params(),
         LoRARequest("sql-lora", 1, lora_path)),
    ]


def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Feed prompts to the engine one per step and print finished outputs.

    Each loop iteration submits at most one pending prompt (removed from the
    front of ``test_prompts``, so the caller's list is consumed), then
    advances the engine by one step. The loop ends once the list is empty and
    the engine reports no unfinished requests.
    """
    submitted = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            text, params, adapter = test_prompts.pop(0)
            # Request ids are the running submission count, as strings.
            engine.add_request(str(submitted),
                               text,
                               params,
                               lora_request=adapter)
            submitted += 1

        step_outputs: List[RequestOutput] = engine.step()

        for done in (out for out in step_outputs if out.finished):
            print(done)


238
def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
    """A tensorized model can be reloaded with LoRA enabled and serve requests.

    The prompts and the request loop come from the multi-LoRA offline
    inference example, imported straight from the examples directory.
    """
    multilora_inference = import_from_path(
        "examples.offline_inference.multilora_inference",
        EXAMPLES_PATH / "offline_inference/multilora_inference.py",
    )

    model_ref = "meta-llama/Llama-2-7b-hf"
    # The adapter is read from the local model store rather than fetched with
    # snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test").
    lora_path = os.path.join(models_path_prefix,
                             "yard1/llama-2-7b-sql-lora-test")
    test_prompts = multilora_inference.create_test_prompts(lora_path)

    # Serialize model before deserializing and binding LoRA adapters
    with vllm_runner(model_ref) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")

        vllm_model.apply_model(
            partial(
                serialize_vllm_model,
                tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))

    with vllm_runner(
            model_ref,
            load_format="tensorizer",
            model_loader_extra_config=TensorizerConfig(
                tensorizer_uri=model_path,
                num_readers=1,
            ),
            enable_lora=True,
            max_loras=1,
            max_lora_rank=8,
            max_cpu_loras=2,
            max_num_seqs=50,
            max_model_len=1000,
    ) as loaded_vllm_model:
        multilora_inference.process_requests(
            loaded_vllm_model.model.llm_engine, test_prompts)

        assert loaded_vllm_model
276
277
278


def test_load_without_tensorizer_load_format(vllm_runner):
    """Passing a TensorizerConfig without load_format="tensorizer" fails."""
    model = None
    with pytest.raises(ValueError):
        model = vllm_runner(
            model_ref,
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
    # Drop whatever was (partially) built and release cached GPU memory so
    # later tests start from a clean state.
    del model
    gc.collect()
    torch.cuda.empty_cache()
287
288
289


@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
    """The OpenAI-compatible server can serve a model loaded via tensorizer."""
    # model_ref may be absolute; joining an absolute path onto tmp_path would
    # discard tmp_path, so only the final path component names the artifact.
    model_path = tmp_path / (os.path.basename(model_ref) + ".tensors")

    ## Serialize model
    with vllm_runner(model_ref) as vllm_model:
        vllm_model.apply_model(
            partial(
                serialize_vllm_model,
                tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))

    # The server receives the loader config as a JSON string on its command
    # line, so the path is passed as a plain string.
    model_loader_extra_config = {
        "tensorizer_uri": str(model_path),
    }

    ## Start OpenAI API server
    openai_args = [
        "--dtype",
        "float16",
        "--load-format",
        "tensorizer",
        "--model-loader-extra-config",
        json.dumps(model_loader_extra_config),
    ]

    with RemoteOpenAIServer(model_ref, openai_args) as server:
        print("Server ready.")

        client = server.get_client()
        completion = client.completions.create(model=model_ref,
                                               prompt="Hello, my name is",
                                               max_tokens=5,
                                               temperature=0.0)

        assert completion.id is not None
        assert len(completion.choices) == 1
        assert len(completion.choices[0].text) >= 5
        assert completion.choices[0].finish_reason == "length"
        assert completion.usage == openai.types.CompletionUsage(
            completion_tokens=5, prompt_tokens=6, total_tokens=11)
329
330
331


def test_raise_value_error_on_invalid_load_format(vllm_runner):
    """A TensorizerConfig combined with load_format="safetensors" fails."""
    model = None
    with pytest.raises(ValueError):
        model = vllm_runner(
            model_ref,
            load_format="safetensors",
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
    # Drop whatever was (partially) built and release cached GPU memory so
    # later tests start from a clean state.
    del model
    gc.collect()
    torch.cuda.empty_cache()
341
342


343
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
def test_tensorizer_with_tp_path_without_template(vllm_runner):
    """Tensor parallelism with a URI that has no shard template is rejected.

    The URI contains no ``%``-style placeholder for the shard index, and the
    runner is expected to raise ValueError for tensor_parallel_size=2.
    """
    model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
    tensorized_path = f"{model_ref}/fp16/model.tensors"

    # Only the runner construction sits inside pytest.raises, so an error in
    # the setup above cannot be mistaken for the expected failure.
    with pytest.raises(ValueError):
        vllm_runner(
            model_ref,
            load_format="tensorizer",
            model_loader_extra_config=TensorizerConfig(
                tensorizer_uri=tensorized_path,
                num_readers=1,
                s3_endpoint="object.ord1.coreweave.com",
            ),
            tensor_parallel_size=2,
            disable_custom_all_reduce=True,
        )
360

361

362
363
364
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
        vllm_runner, tmp_path):
    """Sharded, encrypted serialization round-trips to identical outputs."""
    model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
    # record outputs from un-sharded un-tensorized model
    with vllm_runner(
            model_ref,
            disable_custom_all_reduce=True,
            enforce_eager=True,
    ) as base_model:
        outputs = base_model.generate(prompts, sampling_params)
        base_model.model.llm_engine.model_executor.shutdown()

    # load model with two shards and serialize with encryption
    # model_ref may be absolute; joining an absolute path onto tmp_path would
    # discard tmp_path, so only the final path component names the artifacts.
    artifact_stem = os.path.basename(model_ref)
    # "%02d" is the shard-index template; it is filled in below with 0 and 1.
    model_path = str(tmp_path / (artifact_stem + "-%02d.tensors"))
    key_path = tmp_path / (artifact_stem + ".key")

    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        encryption_keyfile=key_path,
    )

    tensorize_vllm_model(
        engine_args=EngineArgs(
            model=model_ref,
            tensor_parallel_size=2,
            disable_custom_all_reduce=True,
            enforce_eager=True,
        ),
        tensorizer_config=tensorizer_config,
    )
    assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
    assert os.path.isfile(model_path % 1), "Serialization subprocess failed"

    with vllm_runner(
            model_ref,
            tensor_parallel_size=2,
            load_format="tensorizer",
            disable_custom_all_reduce=True,
            enforce_eager=True,
            model_loader_extra_config=tensorizer_config) as loaded_vllm_model:
        deserialized_outputs = loaded_vllm_model.generate(
            prompts, sampling_params)

    assert outputs == deserialized_outputs

408

409
@retry_until_skip(3)
def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
    """Serialize a vLLM model, reload it via tensorizer, compare generations."""
    # Start from a clean GPU state.
    gc.collect()
    torch.cuda.empty_cache()
    model_ref = os.path.join(models_path_prefix, "facebook/opt-125m")
    # model_ref may be absolute; joining an absolute path onto tmp_path would
    # discard tmp_path, so only the final path component names the artifact.
    model_path = tmp_path / (os.path.basename(model_ref) + ".tensors")
    config = TensorizerConfig(tensorizer_uri=str(model_path))

    with vllm_runner(model_ref) as vllm_model:
        outputs = vllm_model.generate(prompts, sampling_params)

        vllm_model.apply_model(
            partial(serialize_vllm_model, tensorizer_config=config))

        assert is_vllm_tensorized(config)

    with vllm_runner(model_ref,
                     load_format="tensorizer",
                     model_loader_extra_config=config) as loaded_vllm_model:
        deserialized_outputs = loaded_vllm_model.generate(
            prompts, sampling_params)

        assert outputs == deserialized_outputs