# test_custom_dispatcher.py
import os

3
4
from vllm.compilation.levels import CompilationLevel

5
6
from ..utils import compare_two_settings

# --enforce-eager on TPU causes graph compilation, which makes the default
# health check in the MQLLMEngine time out, so the RPC timeout is raised to
# 30 s here (the value 30000 is presumably in milliseconds -- TODO confirm).
# This runs at import time, before the test below is invoked.
os.environ["VLLM_RPC_TIMEOUT"] = "30000"


def test_custom_dispatcher():
    """Compare two torch.compile levels on "google/gemma-2b".

    Both settings pass the same ``--enforce-eager`` argument; they differ
    only in the ``VLLM_TORCH_COMPILE_LEVEL`` environment variable, which is
    ``CompilationLevel.DYNAMO_ONCE`` for the first setting and
    ``CompilationLevel.DYNAMO_AS_IS`` for the second.

    NOTE(review): the actual comparison is delegated to
    ``compare_two_settings`` from ``..utils``; presumably it fails the test
    when the two settings disagree -- confirm against that helper.
    """
    compare_two_settings(
        "google/gemma-2b",
        arg1=["--enforce-eager"],
        arg2=["--enforce-eager"],
        # The compile level is passed as a string because it is an
        # environment-variable value.
        env1={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_ONCE)},
        env2={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_AS_IS)})