test_tensorizer.py 16.5 KB
Newer Older
1
import gc
2
3
import json
import os
4
import pathlib
5
6
7
import subprocess
from unittest.mock import MagicMock, patch

8
import openai
9
import pytest
10
import torch
11
from huggingface_hub import snapshot_download
12

zhuwenwen's avatar
zhuwenwen committed
13
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
14
from vllm.engine.arg_utils import EngineArgs
15
# yapf conflicts with isort for this docstring
16
17
18
19
20
21
# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                         TensorSerializer,
                                                         is_vllm_tensorized,
                                                         load_with_tensorizer,
                                                         open_stream,
22
23
                                                         serialize_vllm_model,
                                                         tensorize_vllm_model)
24
# yapf: enable
25
from vllm.utils import PlaceholderModule, import_from_path
26

27
from ..conftest import VllmRunner
28
from ..utils import VLLM_PATH, RemoteOpenAIServer
29
from .conftest import retry_until_skip
zhuwenwen's avatar
zhuwenwen committed
30
from ..utils import RemoteOpenAIServer, models_path_prefix
31

32
33
34
35
36
37
try:
    from tensorizer import EncryptionParams
except ImportError:
    tensorizer = PlaceholderModule("tensorizer")  # type: ignore[assignment]
    EncryptionParams = tensorizer.placeholder_attr("EncryptionParams")

38
# Directory of the repository's example scripts; the LoRA test below loads
# ``multilora_inference.py`` from here.
EXAMPLES_PATH = VLLM_PATH / "examples"

# Prompts shared by the generation tests in this module.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object. The seed is fixed; several tests below
# compare outputs produced with these parameters for equality.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)

# Default model used by the tests, resolved under ``models_path_prefix``.
model_ref = os.path.join(models_path_prefix, "facebook/opt-125m")
# Path of the helper script located next to this test module.
tensorize_model_for_testing_script = os.path.join(
    os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py")
52

53

54
55
56
57
58
59
60
def is_curl_installed():
    """Return ``True`` if ``curl --version`` can be run successfully.

    The result feeds ``pytest.mark.skipif`` decorators, so this runs while
    the module is being imported. For that reason it must never raise and
    should not write curl's version banner into the test output.

    Returns:
        ``True`` when the command exits with status 0, ``False`` when the
        executable is missing, cannot be executed, or exits non-zero.
    """
    try:
        subprocess.check_call(['curl', '--version'],
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (not installed) as well as
        # PermissionError (present but not executable).
        return False
    return True

61

62
63
def get_torch_model(vllm_runner: VllmRunner):
    """Return the model object held by the runner's driver worker.

    Follows the attribute chain ``model -> llm_engine -> model_executor ->
    driver_worker -> model_runner -> model`` starting at *vllm_runner*.

    Args:
        vllm_runner: Runner whose underlying model should be extracted.

    Returns:
        The object stored in the final ``model`` attribute of that chain.
    """
    engine = vllm_runner.model.llm_engine
    model_runner = engine.model_executor.driver_worker.model_runner
    return model_runner.model

71
72
73
74
75
76

def write_keyfile(keyfile_path: str):
    """Create a random encryption key and write its bytes to *keyfile_path*.

    Any missing parent directories of the target file are created first.
    """
    params = EncryptionParams.random()
    destination = pathlib.Path(keyfile_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open('wb') as keyfile:
        keyfile.write(params.key)
77
78


79
@patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent')
def test_load_with_tensorizer(mock_agent, tensorizer_config):
    """Check how ``load_with_tensorizer`` drives the (patched) agent class.

    The agent must be constructed exactly once with the config and the
    quantization method, ``deserialize`` must be called exactly once, and
    its return value must be handed back to the caller.
    """
    quant_method = MagicMock()
    agent_instance = mock_agent.return_value
    agent_instance.deserialize.return_value = MagicMock()

    result = load_with_tensorizer(tensorizer_config,
                                  quant_method=quant_method)

    mock_agent.assert_called_once_with(tensorizer_config,
                                       quant_method=quant_method)
    agent_instance.deserialize.assert_called_once()
    assert result == agent_instance.deserialize.return_value


@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_can_deserialize_s3(vllm_runner):
    """Load a pre-tensorized model and check that generation yields output.

    NOTE(review): ``tensorized_path`` is derived from the local model
    directory rather than an ``s3://`` URI, although an S3 endpoint is
    passed to ``TensorizerConfig`` -- confirm this is intended.
    """
    model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
    tensorized_path = f"{model_ref}/fp16/model.tensors"

    with vllm_runner(model_ref,
                     load_format="tensorizer",
                     model_loader_extra_config=TensorizerConfig(
                         tensorizer_uri=tensorized_path,
                         num_readers=1,
                         s3_endpoint="object.ord1.coreweave.com",
                     )) as loaded_hf_model:
        deserialized_outputs = loaded_hf_model.generate(
            prompts, sampling_params)

        # Only checks that something was generated, not what.
        assert deserialized_outputs
111
112
113
114
115


@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_deserialized_encrypted_vllm_model_has_same_outputs(
        vllm_runner, tmp_path):
    """Round-trip a model through encrypted serialization.

    Generates with the regular model, serializes it with an encryption
    keyfile, reloads it through the tensorizer load format using the same
    keyfile, and expects identical outputs.
    """
    with vllm_runner(model_ref) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
        key_path = tmp_path / (model_ref + ".key")
        # write_keyfile also creates the parent directory of key_path,
        # which is the same directory model_path lives in.
        write_keyfile(key_path)

        outputs = vllm_model.generate(prompts, sampling_params)

        config_for_serializing = TensorizerConfig(tensorizer_uri=model_path,
                                                  encryption_keyfile=key_path)
        serialize_vllm_model(get_torch_model(vllm_model),
                             config_for_serializing)

    config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                encryption_keyfile=key_path)

    with vllm_runner(model_ref,
                     load_format="tensorizer",
                     model_loader_extra_config=config_for_deserializing
                     ) as loaded_vllm_model:
        deserialized_outputs = loaded_vllm_model.generate(
            prompts, sampling_params)

        assert outputs == deserialized_outputs
141
142
143
144


def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
                                                tmp_path):
    """Serialize an HF model with ``TensorSerializer`` and reload it in vLLM.

    Greedy outputs of the HF model are compared for equality with greedy
    outputs of the vLLM model loaded from the serialized tensors.
    """
    with hf_runner(model_ref) as hf_model:
        model_path = tmp_path / (model_ref + ".tensors")
        max_tokens = 50
        outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
        with open_stream(model_path, "wb+") as stream:
            serializer = TensorSerializer(stream)
            serializer.write_module(hf_model.model)

    with vllm_runner(model_ref,
                     load_format="tensorizer",
                     model_loader_extra_config=TensorizerConfig(
                         tensorizer_uri=model_path,
                         num_readers=1,
                     )) as loaded_hf_model:
        deserialized_outputs = loaded_hf_model.generate_greedy(
            prompts, max_tokens=max_tokens)

        assert outputs == deserialized_outputs
163
164


zhuwenwen's avatar
zhuwenwen committed
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def create_test_prompts(
        lora_path: str
) -> "List[Tuple[str, SamplingParams, Optional[LoRARequest]]]":
    """Create a list of test prompts with their sampling parameters.

    2 requests for base model, 4 requests for the LoRA. We define 2
    different LoRA adapters (using the same model for demo purposes).
    Since we also set `max_loras=1`, the expectation is that the requests
    with the second LoRA adapter will be run after all requests with the
    first adapter have finished.

    Args:
        lora_path: Location of the LoRA adapter passed to every
            ``LoRARequest``.

    Returns:
        A list of ``(prompt, sampling_params, lora_request)`` tuples;
        ``lora_request`` is ``None`` for the base-model requests.
    """
    # The return annotation is a string because this module does not import
    # the typing names or LoRARequest at file level; an evaluated annotation
    # would raise NameError when the module is imported.
    from vllm.lora.request import LoRARequest

    return [
        ("A robot may not injure a human being",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128), None),
        ("To be or not to be,",
         SamplingParams(temperature=0.8,
                        top_k=5,
                        presence_penalty=0.2,
                        max_tokens=128), None),
        (
            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
            SamplingParams(temperature=0.0,
                           logprobs=1,
                           prompt_logprobs=1,
                           max_tokens=128,
                           stop_token_ids=[32003]),
            LoRARequest("sql-lora", 1, lora_path)),
        (
            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
            SamplingParams(n=3,
                           best_of=3,
                           use_beam_search=True,
                           temperature=0,
                           max_tokens=128,
                           stop_token_ids=[32003]),
            LoRARequest("sql-lora", 1, lora_path)),
        (
            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
            SamplingParams(temperature=0.0,
                           logprobs=1,
                           prompt_logprobs=1,
                           max_tokens=128,
                           stop_token_ids=[32003]),
            LoRARequest("sql-lora2", 2, lora_path)),
        (
            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
            SamplingParams(n=3,
                           best_of=3,
                           use_beam_search=True,
                           temperature=0,
                           max_tokens=128,
                           stop_token_ids=[32003]),
            LoRARequest("sql-lora", 1, lora_path)),
    ]


def process_requests(
    engine: "LLMEngine",
    test_prompts: "List[Tuple[str, SamplingParams, Optional[LoRARequest]]]",
):
    """Continuously process a list of prompts and handle the outputs.

    One queued prompt is submitted per loop iteration (request ids are
    "0", "1", ...), then the engine is stepped once and every finished
    request output is printed. The loop ends when the queue is empty and
    the engine reports no unfinished requests.

    Args:
        engine: Object providing ``add_request``, ``step`` and
            ``has_unfinished_requests``.
        test_prompts: ``(prompt, sampling_params, lora_request)`` tuples.
            The list is consumed in place and is empty on return.
    """
    # Annotations are strings because the typing names and LoRARequest are
    # not imported by this module; evaluating them would raise NameError.
    request_id = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params, lora_request = test_prompts.pop(0)
            engine.add_request(str(request_id),
                               prompt,
                               sampling_params,
                               lora_request=lora_request)
            request_id += 1

        for request_output in engine.step():
            if request_output.finished:
                print(request_output)


246
def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
    """Serialize a model, reload it via tensorizer, and run LoRA requests.

    The prompt list and the request loop come from the repository's
    ``multilora_inference.py`` example, loaded from ``EXAMPLES_PATH``.

    NOTE(review): ``model_ref`` here is a bare hub id, whereas the LoRA
    adapter path is resolved under ``models_path_prefix`` -- confirm the
    difference is intended.
    """
    multilora_inference = import_from_path(
        "examples.multilora_inference",
        EXAMPLES_PATH / "multilora_inference.py",
    )

    model_ref = "meta-llama/Llama-2-7b-hf"
    # The adapter is read from the local model directory.
    lora_path = os.path.join(models_path_prefix,
                             "yard1/llama-2-7b-sql-lora-test")
    test_prompts = multilora_inference.create_test_prompts(lora_path)

    # Serialize model before deserializing and binding LoRA adapters
    with vllm_runner(model_ref, ) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")

        serialize_vllm_model(get_torch_model(vllm_model),
                             TensorizerConfig(tensorizer_uri=model_path))

    with vllm_runner(
            model_ref,
            load_format="tensorizer",
            model_loader_extra_config=TensorizerConfig(
                tensorizer_uri=model_path,
                num_readers=1,
            ),
            enable_lora=True,
            max_loras=1,
            max_lora_rank=8,
            max_cpu_loras=2,
            max_num_seqs=50,
            max_model_len=1000,
    ) as loaded_vllm_model:
        multilora_inference.process_requests(
            loaded_vllm_model.model.llm_engine, test_prompts)

        assert loaded_vllm_model
282
283
284


def test_load_without_tensorizer_load_format(vllm_runner):
    """A ``TensorizerConfig`` without ``load_format`` must raise ValueError."""
    model = None
    with pytest.raises(ValueError):
        model = vllm_runner(
            model_ref,
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
    # Drop the reference and release cached GPU memory before the next test.
    del model
    gc.collect()
    torch.cuda.empty_cache()
293
294
295


@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
    """Serve a tensorized model through the OpenAI-compatible API server.

    The model is serialized first; the server is then started with
    ``--load-format tensorizer`` and the serialized path passed as JSON via
    ``--model-loader-extra-config``. A single 5-token completion is
    requested and its id, choices, finish reason and usage are checked.
    """
    # Serialize model
    with vllm_runner(model_ref, ) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")

        serialize_vllm_model(get_torch_model(vllm_model),
                             TensorizerConfig(tensorizer_uri=model_path))

        # str() because the config is passed on the command line as JSON.
        model_loader_extra_config = {
            "tensorizer_uri": str(model_path),
        }

    # Start OpenAI API server
    openai_args = [
        "--dtype",
        "float16",
        "--load-format",
        "tensorizer",
        "--model-loader-extra-config",
        json.dumps(model_loader_extra_config),
    ]

    with RemoteOpenAIServer(model_ref, openai_args) as server:
        print("Server ready.")

        client = server.get_client()
        completion = client.completions.create(model=model_ref,
                                               prompt="Hello, my name is",
                                               max_tokens=5,
                                               temperature=0.0)

        assert completion.id is not None
        assert len(completion.choices) == 1
        assert len(completion.choices[0].text) >= 5
        assert completion.choices[0].finish_reason == "length"
        assert completion.usage == openai.types.CompletionUsage(
            completion_tokens=5, prompt_tokens=6, total_tokens=11)
333
334
335


def test_raise_value_error_on_invalid_load_format(vllm_runner):
    """A ``TensorizerConfig`` with a non-tensorizer ``load_format`` raises."""
    model = None
    with pytest.raises(ValueError):
        model = vllm_runner(
            model_ref,
            load_format="safetensors",
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
    # Drop the reference and release cached GPU memory before the next test.
    del model
    gc.collect()
    torch.cuda.empty_cache()
345
346


347
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
def test_tensorizer_with_tp_path_without_template(vllm_runner):
    """Tensor-parallel loading from a fixed (non-template) URI must raise.

    ``tensorized_path`` contains no per-shard placeholder while
    ``tensor_parallel_size=2`` is requested; constructing the runner is
    expected to fail with ``ValueError``.
    """
    # Setup stays outside the ``raises`` block so that only the runner
    # construction can satisfy the expected exception.
    model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
    tensorized_path = f"{model_ref}/fp16/model.tensors"

    with pytest.raises(ValueError):
        vllm_runner(
            model_ref,
            load_format="tensorizer",
            model_loader_extra_config=TensorizerConfig(
                tensorizer_uri=tensorized_path,
                num_readers=1,
                s3_endpoint="object.ord1.coreweave.com",
            ),
            tensor_parallel_size=2,
            disable_custom_all_reduce=True,
        )
364

365

366
367
368
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
        vllm_runner, tmp_path):
    """Round-trip a 2-way tensor-parallel model through encrypted tensors.

    Outputs of the plain model are recorded, the model is serialized with
    ``tensor_parallel_size=2`` into one file per shard, and the outputs of
    the model reloaded from those files are compared for equality.
    """
    model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
    # record outputs from un-sharded un-tensorized model
    with vllm_runner(
            model_ref,
            disable_custom_all_reduce=True,
            enforce_eager=True,
    ) as base_model:
        outputs = base_model.generate(prompts, sampling_params)
        base_model.model.llm_engine.model_executor.shutdown()

    # load model with two shards and serialize with encryption
    # "%02d" is the shard-index placeholder; it is filled with 0 and 1 in
    # the file-existence checks below.
    model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
    key_path = tmp_path / (model_ref + ".key")

    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        encryption_keyfile=key_path,
    )

    tensorize_vllm_model(
        engine_args=EngineArgs(
            model=model_ref,
            tensor_parallel_size=2,
            disable_custom_all_reduce=True,
            enforce_eager=True,
        ),
        tensorizer_config=tensorizer_config,
    )
    assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
    assert os.path.isfile(model_path % 1), "Serialization subprocess failed"

    with vllm_runner(
            model_ref,
            tensor_parallel_size=2,
            load_format="tensorizer",
            disable_custom_all_reduce=True,
            enforce_eager=True,
            model_loader_extra_config=tensorizer_config) as loaded_vllm_model:
        deserialized_outputs = loaded_vllm_model.generate(
            prompts, sampling_params)

    assert outputs == deserialized_outputs

412

413
@retry_until_skip(3)
def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
    """Round-trip a vLLM model through serialization and compare outputs.

    Also asserts that ``is_vllm_tensorized`` is true for the config once
    the model has been serialized.
    """
    # Collect garbage and release cached GPU memory before loading models.
    gc.collect()
    torch.cuda.empty_cache()
    model_ref = os.path.join(models_path_prefix, "facebook/opt-125m")
    model_path = tmp_path / (model_ref + ".tensors")
    config = TensorizerConfig(tensorizer_uri=str(model_path))

    with vllm_runner(model_ref) as vllm_model:
        outputs = vllm_model.generate(prompts, sampling_params)
        serialize_vllm_model(get_torch_model(vllm_model), config)

        assert is_vllm_tensorized(config)

    with vllm_runner(model_ref,
                     load_format="tensorizer",
                     model_loader_extra_config=config) as loaded_vllm_model:
        deserialized_outputs = loaded_vllm_model.generate(
            prompts, sampling_params)

        assert outputs == deserialized_outputs