Re-enable the 80 char line width limit (#3305)

2f8844ba · Zhuohan Li · GitHub · 4b59f00e · 2f8844ba · 2f8844ba
Unverified commit 2f8844ba, authored Mar 10, 2024 by Zhuohan Li; committed by GitHub on Mar 10, 2024
20 changed files
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,6 +9,10 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"

+[tool.ruff]
+# Allow lines to be as long as 80.
+line-length = 80
+
 [tool.ruff.lint]
 select = [
    # pycodestyle
@@ -29,8 +33,6 @@ ignore = [
    "F405", "F403",
    # lambda expression assignment
    "E731",
-    # line too long, handled by black formatting
-    "E501",
    # .strip() with multi-character strings
    "B005",
    # Loop control variable not used within loop body

--- a/setup.py
+++ b/setup.py
@@ -142,8 +142,8 @@ def get_pytorch_rocm_arch() -> Set[str]:
    # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator
    if env_arch_list is None:
        command = "rocm_agent_enumerator"
-        env_arch_list = subprocess.check_output([command]).decode('utf-8')\
-                        .strip().replace("\n", ";")
+        env_arch_list = (subprocess.check_output(
+            [command]).decode('utf-8').strip().replace("\n", ";"))
        arch_source_str = "rocm_agent_enumerator"
    else:
        arch_source_str = "PYTORCH_ROCM_ARCH env variable"

--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
@@ -73,7 +73,7 @@ def test_load_chat_template():
    assert template_content is not None
    # Hard coded value for template_chatml.jinja
    assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
-{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501


 def test_no_load_chat_template():
@@ -117,4 +117,6 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
        add_generation_prompt=mock_request.add_generation_prompt)

    # Test assertion
-    assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}"
+    assert result == expected_output, (
+        f"The generated prompt does not match the expected output for "
+        f"model {model} and template {template}")
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -4,7 +4,8 @@ from typing import List

 from vllm import SamplingParams
 from vllm.block import PhysicalTokenBlock
-from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus
+from vllm.core.block_manager import (BlockAllocator, BlockSpaceManager,
+                                     AllocStatus)
 from vllm.utils import Device
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob


--- a/tests/entrypoints/test_guided_processors.py
+++ b/tests/entrypoints/test_guided_processors.py
@@ -46,8 +46,8 @@ TEST_SCHEMA = {
    "required": ["name", "age", "skills", "work history"]
 }

-TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \
-             r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
+TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")


 def test_guided_logits_processors():

--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -5,9 +5,12 @@ import time
 import sys
 import pytest
 import requests
-import ray  # using Ray for overall ease of process management, parallel requests, and debugging.
+# using Ray for overall ease of process management, parallel requests,
+# and debugging.
+import ray
 import openai  # use the official client for correctness check
-from huggingface_hub import snapshot_download  # downloading lora to test lora requests
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download

 # imports for guided decoding tests
 import json
@@ -17,8 +20,11 @@ import re
 from vllm.transformers_utils.tokenizer import get_tokenizer

 MAX_SERVER_START_WAIT_S = 600  # wait up to 600 seconds for the server to start
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"  # any model with a chat template should work here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"  # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# generation quality here
+LORA_NAME = "typeof/zephyr-7b-beta-lora"

 TEST_SCHEMA = {
    "type": "object",
@@ -59,8 +65,8 @@ TEST_SCHEMA = {
    "required": ["name", "age", "skills", "work history"]
 }

-TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \
-             r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
+TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")

 TEST_CHOICE = [
    "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
@@ -120,8 +126,9 @@ def server(zephyr_lora_files):
    server_runner = ServerRunner.remote([
        "--model",
        MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
        "--dtype",
-        "bfloat16",  # use half precision for speed and memory savings in CI environment
+        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
@@ -392,7 +399,8 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
        max_tokens=5,
        temperature=0.0,
        extra_body=dict(
-            # NOTE: this has to be true for n > 1 in vLLM, but not necessary for official client.
+            # NOTE: this has to be true for n > 1 in vLLM, but not necessary
+            # for official client.
            use_beam_search=True),
    )
    assert len(batch.choices) == 4
@@ -469,8 +477,8 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
 async def test_guided_json_completion(server, client: openai.AsyncOpenAI):
    completion = await client.completions.create(
        model=MODEL_NAME,
-        prompt=
-        f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}",
+        prompt=f"Give an example JSON for an employee profile "
+        f"that fits this schema: {TEST_SCHEMA}",
        n=3,
        temperature=1.0,
        max_tokens=500,
@@ -489,8 +497,10 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
-        "role": "user",
-        "content": "Give an example JSON for an employee profile that " + \
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
        f"fits this schema: {TEST_SCHEMA}"
    }]
    chat_completion = await client.chat.completions.create(

--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -57,7 +57,8 @@ def test_fused_moe(
                         [torch.float32, torch.float16, torch.bfloat16])
 @torch.inference_mode()
 def test_mixtral_moe(dtype: torch.dtype):
-    "Make sure our Mixtral MoE implementation agrees with the one from huggingface."
+    """Make sure our Mixtral MoE implementation agrees with the one from
+    huggingface."""

    # Instantiate our and huggingface's MoE blocks
    config = MixtralConfig()

--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -114,7 +114,8 @@ def test_contexted_kv_attention(
    v_cache = v_cache.view(-1, block_size, num_kv_heads,
                           head_size).permute(0, 2, 3, 1).contiguous()

-    # Warm up the Triton kernel by calling it once before actually measuring generation time
+    # Warm up the Triton kernel by calling it once before actually measuring
+    # generation time
    context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
                          b_start_loc, b_seq_len, b_ctx_len, max_input_len)
    torch.cuda.synchronize()

--- a/tests/lora/test_layer_variation.py
+++ b/tests/lora/test_layer_variation.py
@@ -11,9 +11,9 @@ from .conftest import cleanup

 MODEL_PATH = "Felladrin/Llama-68M-Chat-v1"
 PROMPTS = [
-    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",
-    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",
-    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",
+    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",  # noqa: E501
+    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",  # noqa: E501
+    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",  # noqa: E501
 ]



--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -17,14 +17,16 @@ from vllm.lora.layers import (
    LoRAMapping,
    BaseLayerWithLoRA,
 )
-from vllm.lora.models import LoRALayerWeights, convert_mapping, PackedLoRALayerWeights
+from vllm.lora.models import (LoRALayerWeights, convert_mapping,
+                              PackedLoRALayerWeights)
 from vllm.config import LoRAConfig
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
                                               RowParallelLinear,
                                               QKVParallelLinear)
-from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.utils import set_random_seed

 from .utils import DummyLoRAManager
@@ -258,7 +260,8 @@ def test_embeddings(dist_init, num_loras, device) -> None:


 @torch.inference_mode()
-# @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.")
+# @pytest.mark.skip(
+#     reason="Fails when loras are in any slot other than the first.")
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
@@ -674,9 +677,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
            result = linear(input_)[0]
            subloras = sublora_dict[lora_id]
            for i, sublora in enumerate(subloras):
-                result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * (
-                    i + 1
-                )] += input_ @ sublora.lora_a @ sublora.lora_b * sublora.scaling
+                result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] *
+                       (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b *
+                                    sublora.scaling)
            expected_results.append(result)
        expected_result = torch.cat(expected_results)


--- a/tests/lora/test_llama.py
+++ b/tests/lora/test_llama.py
@@ -10,12 +10,12 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"

 def do_sample(llm, lora_path: str, lora_id: int):
    prompts = [
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
    ]
    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=256,
@@ -48,20 +48,20 @@ def test_llama_lora(sql_lora_files, tp_size):
                   tensor_parallel_size=tp_size)

    expected_no_lora_output = [
-        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",
-        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",
-        "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",
-        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",
-        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",
-        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",
+        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",  # noqa: E501
+        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",  # noqa: E501
+        "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",  # noqa: E501
+        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",  # noqa: E501
+        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",  # noqa: E501
+        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",  # noqa: E501
    ]
    expected_lora_output = [
-        "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",
-        "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
-        "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",
-        "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",
-        "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
-        "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "
+        "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
+        "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
+        "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
+        "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
+        "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
+        "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
    ]

    print("lora adapter created")
@@ -121,7 +121,8 @@ def test_llama_tensor_parallel_equality(sql_lora_files):


 def test_llama_lora_warmup(sql_lora_files):
-    """Test that the LLM initialization works with a warmup LORA path and is more conservative"""
+    """Test that the LLM initialization works with a warmup LORA path and
+    is more conservative"""

    @ray.remote(num_gpus=1)
    def get_num_gpu_blocks_lora():
@@ -132,13 +133,15 @@ def test_llama_lora_warmup(sql_lora_files):
    @ray.remote(num_gpus=1)
    def get_num_gpu_blocks_no_lora():
        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
-        num_gpu_blocks_no_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
+        num_gpu_blocks_no_lora_warmup = (
+            llm.llm_engine.cache_config.num_gpu_blocks)
        return num_gpu_blocks_no_lora_warmup

    num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
    num_gpu_blocks_no_lora_warmup = ray.get(
        get_num_gpu_blocks_no_lora.remote())
    assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
-        "The warmup with lora should be more"
-        " conservative than without lora, therefore the number of memory blocks for the KV cache should be "
+        "The warmup with lora should be more "
+        "conservative than without lora, therefore the number of "
+        "memory blocks for the KV cache should be "
        "less when using lora than when not using lora")
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -9,9 +9,9 @@ MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"

 def do_sample(llm, lora_path: str, lora_id: int):
    prompts = [
-        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",
-        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",
-        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",
+        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",  # noqa: E501
+        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",  # noqa: E501
+        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",  # noqa: E501
    ]
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
    outputs = llm.generate(
@@ -42,9 +42,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
                   worker_use_ray=True)

    expected_lora_output = [
-        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",
-        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",
-        "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",
+        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
+        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
+        "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",  # noqa: E501
    ]

    assert do_sample(llm, mixtral_lora_files,

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens(
                             gpu_memory_utilization=0.4)
    tokenizer = vllm_model.model.get_tokenizer()
    prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
-    # This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding.
+    # This test needs at least 2 prompts in a batch of different lengths to
+    # verify their token count is correct despite padding.
    assert len(example_prompts) > 1, "at least 2 prompts are required"
    assert prompt_token_counts[0] != prompt_token_counts[1], (
        "prompts of different lengths are required")
@@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens(
        **stat_logger.labels)._value.get()

    assert vllm_prompt_token_count == metric_count, (
-        f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}"
-    )
+        f"prompt token count: {vllm_prompt_token_count!r}\n"
+        f"metric: {metric_count!r}")


 @pytest.mark.parametrize("model", MODELS)
@@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens(
    for i in range(len(example_prompts)):
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        prompt_ids = tokenizer.encode(example_prompts[i])
-        # vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens.
+        # vllm_output_ids contains both prompt tokens and generation tokens.
+        # We're interested only in the count of the generation tokens.
        vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)

    assert vllm_generation_count == metric_count, (
-        f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}"
-    )
+        f"generation token count: {vllm_generation_count!r}\n"
+        f"metric: {metric_count!r}")
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -14,7 +14,8 @@ Run `pytest tests/models/test_marlin.py --forked`.
 import pytest
 import torch
 from dataclasses import dataclass
-from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY
+from vllm.model_executor.layers.quantization import (
+    _QUANTIZATION_CONFIG_REGISTRY)

 capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
@@ -87,11 +88,11 @@ def test_models(
            if marlin_output_id != gptq_output_id:
                # Each predicted token must be in top 5 of the other's
                assert gptq_output_id in marlin_logprobs[idx], (
-                    f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}"
-                )
+                    f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
+                    f"Marlin:\t{marlin_output_str!r}")
                assert marlin_output_id in gptq_logprobs[idx], (
-                    f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}"
-                )
+                    f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
+                    f"Marlin:\t{marlin_output_str!r}")

                # Break out since sequences will now diverge.
                break
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -20,20 +20,23 @@ def test_block_allocator(
                                     num_blocks,
                                     enable_caching=True)

-    # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock
+    # Allocate two PhysicalTokenBlocks with the same hash and check
+    # that they are the same PhysicalTokenBlock
    first_block = block_allocator.allocate(block_hash, 0)
    second_block = block_allocator.allocate(block_hash, 0)
    assert (first_block == second_block)
    assert (second_block.ref_count == 2)

-    # Free the first_block and confirm that the ref_count is correctly decremented on the second block
+    # Free the first_block and confirm that the ref_count is correctly
+    # decremented on the second block
    block_allocator.free(first_block)
    assert (second_block.ref_count == 1)

    # Free the second block
    block_allocator.free(second_block)

-    # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back
+    # Reallocate the first block and confirm that, even after the block
+    # had its ref_count go to 0, we still get the same block back
    first_block = block_allocator.allocate(block_hash, 0)
    assert (first_block == second_block)
    assert (first_block.block_hash == block_hash)
@@ -56,7 +59,8 @@ def test_eviction(num_blocks: int, ):
    for block in blocks:
        block_allocator.free(block)

-    # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block
+    # Allocate a new block and confirm that it's the first block freed.
+    # i.e., the Least Recently Used block
    new_block_hash = block_size
    new_block = block_allocator.allocate(new_block_hash, 0)
    assert (new_block == blocks[0])
@@ -68,7 +72,8 @@ def test_eviction(num_blocks: int, ):
    assert (realloc_block == blocks[realloc_block_hash])
    assert (realloc_block.block_hash == realloc_block_hash)

-    # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list
+    # Allocate a new block and confirm that it's not the realloc_block,
+    # since the realloc_block shouldn't be in the free list
    new_block_hash = block_size + 1
    new_block = block_allocator.allocate(new_block_hash, 0)
    assert (realloc_block != new_block)

--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -70,8 +70,8 @@ def test_get_prompt_logprobs(
                                           hf_logprob[i][-1][token_id].item(),
                                           atol=1e-2,
                                           rtol=1e-2)
-                assert isinstance(sample_logprob.decoded_token, str), \
-                    ("The token should be decoded by the time it is returned "
+                assert isinstance(sample_logprob.decoded_token, str), (
+                    "The token should be decoded by the time it is returned "
                    " to the user.")



--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -255,9 +255,10 @@ def test_sampler_mixed(seed: int, device: str):
            if metadata.sampling_params.use_beam_search:
                continue

-            if metadata.sampling_params.seed is not None \
-                    and expected_tokens[i] is None:
-                # Record seeded random result to compare with results of second invocation
+            if (metadata.sampling_params.seed is not None
+                    and expected_tokens[i] is None):
+                # Record seeded random result to compare with results of
+                # second invocation
                expected_tokens[i] = [
                    nth_output.output_token
                    for nth_output in sequence_output.samples
@@ -265,11 +266,13 @@ def test_sampler_mixed(seed: int, device: str):
                continue

            for n, nth_output in enumerate(sequence_output.samples):
-                if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None:
+                if (metadata.sampling_params.temperature == 0
+                        or metadata.sampling_params.seed is not None):
                    # Ensure exact matches for greedy or random with seed
                    assert nth_output.output_token == expected_tokens[i][n]
                else:
-                    # For non-seeded random check that one of the high-logit tokens were chosen
+                    # For non-seeded random check that one of the high-logit
+                    # tokens was chosen
                    assert nth_output.output_token in expected_tokens[i]

    # Test batch
@@ -284,8 +287,8 @@ def test_sampler_mixed(seed: int, device: str):
    input_tensor.data = input_tensor.index_select(0, target_index)
    fake_logits.data = fake_logits.index_select(0, target_index)

-    # This time, results of seeded random samples will be compared with the corresponding
-    # sample in the pre-shuffled batch
+    # This time, results of seeded random samples will be compared with
+    # the corresponding sample in the pre-shuffled batch
    test_sampling(model_runner)

    del model_runner

--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
@@ -150,8 +150,10 @@ def test_initial_metrics_has_correct_values(has_data: bool):
    assert metrics.emitted_tokens == num_emitted_tokens

    if has_data:
-        assert metrics.draft_acceptance_rate == num_accepted_tokens / num_draft_tokens
-        assert metrics.system_efficiency == num_emitted_tokens / num_possible_tokens
+        assert (metrics.draft_acceptance_rate == num_accepted_tokens /
+                num_draft_tokens)
+        assert (metrics.system_efficiency == num_emitted_tokens /
+                num_possible_tokens)
    else:
        assert math.isnan(metrics.draft_acceptance_rate)
        assert math.isnan(metrics.system_efficiency)
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -3,7 +3,8 @@ import random
 import pytest
 from unittest.mock import MagicMock

-from vllm.spec_decode.multi_step_worker import MultiStepWorker, DraftModelTop1Proposer
+from vllm.spec_decode.multi_step_worker import (MultiStepWorker,
+                                                DraftModelTop1Proposer)
 from vllm.worker.worker import Worker
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import SamplerOutput

--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -4,12 +4,15 @@ import pytest
 from unittest.mock import MagicMock

 from vllm.spec_decode.multi_step_worker import MultiStepWorker
-from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker, split_num_cache_blocks_evenly
+from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
+                                                 split_num_cache_blocks_evenly)
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.model_executor.utils import set_random_seed
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from .utils import mock_worker, create_batch, ExecuteModelData, create_sampler_output_list
-from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics, AsyncMetricsCollector
+from .utils import (mock_worker, create_batch, ExecuteModelData,
+                    create_sampler_output_list)
+from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics,
+                                      AsyncMetricsCollector)


 @pytest.mark.parametrize('k', [1, 2, 6])
@@ -391,13 +394,15 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):

    mock_rejsample_metrics = MagicMock(
        spec=SpecDecodeWorkerMetrics) if returns_metrics else None
-    metrics_collector.maybe_collect_rejsample_metrics.return_value = mock_rejsample_metrics
+    metrics_collector.maybe_collect_rejsample_metrics.return_value = (
+        mock_rejsample_metrics)

    output = worker.execute_model(**execute_model_data.to_dict(),
                                  num_spec_tokens=k)
    assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics

-    call_args_list = metrics_collector.maybe_collect_rejsample_metrics.call_args_list
+    call_args_list = (
+        metrics_collector.maybe_collect_rejsample_metrics.call_args_list)
    assert len(call_args_list) == 1
    args, kwargs = call_args_list[0]
    assert args[0] == k or kwargs.get('k', -1) == k
@@ -547,7 +552,8 @@ def test_profile_num_available_blocks(available_gpu_blocks: int,

    target_worker.profile_num_available_blocks.return_value = (
        available_gpu_blocks, available_cpu_blocks)
-    target_worker.get_cache_block_size_bytes.return_value = target_cache_block_size_bytes
+    target_worker.get_cache_block_size_bytes.return_value = (
+        target_cache_block_size_bytes)
    draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes

    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,