[CI/Build] drop support for Python 3.8 EOL (#8464)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

[CI/Build] drop support for Python 3.8 EOL (#8464)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
21063c11 · Aaron Pham · GitHub · 4be3a451 · 21063c11 · 21063c11
Unverified commit 21063c11, authored Nov 06, 2024 by Aaron Pham; committed by GitHub on Nov 06, 2024
20 changed files
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -56,7 +56,7 @@ serving_column_mapping = {
 def read_markdown(file):
    if os.path.exists(file):
-        with open(file, "r") as f:
+        with open(file) as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"
@@ -75,14 +75,14 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            raw_result = json.loads(f.read())
        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)
@@ -97,7 +97,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_latency.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)
@@ -119,7 +119,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_throughput.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)

--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -72,7 +72,7 @@ def main(args):
    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            results = results + json.loads(f.read())
    # generate markdown table
@@ -80,7 +80,7 @@ def main(args):
    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
-    with open(args.description, "r") as f:
+    with open(args.description) as f:
        description = f.read()
    description = description.format(

--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -36,11 +36,11 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            raw_result = json.loads(f.read())
        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
        raw_result.update(command)

--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -25,7 +25,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
    steps:
    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
    - name: Set up Python ${{ matrix.python-version }}

--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -48,7 +48,7 @@ jobs:
      fail-fast: false
      matrix:
          os: ['ubuntu-20.04']
-          python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+          python-version: ['3.9', '3.10', '3.11', '3.12']
          pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
          cuda-version: ['11.8', '12.1']

--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -6,7 +6,7 @@ version: 2
 build:
  os: ubuntu-22.04
  tools:
-    python: "3.8"
+    python: '3.9'
 sphinx:
  configuration: docs/source/conf.py
@@ -19,4 +19,3 @@ formats: []
 python:
  install:
    - requirements: docs/requirements-docs.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -79,7 +79,7 @@ async def async_request_tgi(
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
-                        chunk = remove_prefix(chunk_bytes, "data:")
+                        chunk = chunk_bytes.removeprefix("data:")
                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
@@ -144,7 +144,7 @@ async def async_request_trt_llm(
                        if not chunk_bytes:
                            continue
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data:")
                        data = json.loads(chunk)
@@ -261,7 +261,7 @@ async def async_request_openai_completions(
                        if not chunk_bytes:
                            continue
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
@@ -349,7 +349,7 @@ async def async_request_openai_chat_completions(
                        if not chunk_bytes:
                            continue
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
@@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
    return output
-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix):]
-    return text
 def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -269,10 +269,10 @@ def run_square_bench(args):
 def run_range_bench(args):
-    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
+    m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
-    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
    m_increment, k_increment, n_increment = \
-        [int(x) for x in args.dim_increment.split(",")]
+        (int(x) for x in args.dim_increment.split(","))
    Ms = list(range(m_start, m_end + 1, m_increment))
    Ks = list(range(k_start, k_end + 1, k_increment))
    Ns = list(range(n_start, n_end + 1, n_increment))

--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -468,7 +468,7 @@ def generate():
    impl_configs = []
    GPTQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
            element_a=element_a,
            element_b=element_b,
            element_b_scale=element_a,
@@ -476,7 +476,7 @@ def generate():
            element_d=element_a,
            accumulator=DataType.f32,
        ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-         for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))
    GPTQ_kernel_specializations = [
        Specialization(with_C=False, with_zeropoints=False, with_scales=True)
@@ -490,7 +490,7 @@ def generate():
    ]
    AWQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
            element_a=element_a,
            element_b=element_b,
            element_b_scale=element_a,
@@ -498,7 +498,7 @@ def generate():
            element_d=element_a,
            accumulator=DataType.f32,
        ) for element_b in (DataType.u4, DataType.u8)
-         for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))
    AWQ_kernel_specializations = [
        Specialization(with_C=False, with_zeropoints=True, with_scales=True)

--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -10,7 +10,7 @@ Requirements
 ============
 * OS: Linux
-* Python: 3.8 - 3.12
+* Python: 3.9 -- 3.12
 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 Install released versions

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ select = [
    # Pyflakes
    "F",
    # pyupgrade
-    # "UP",
+    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
@@ -55,7 +55,7 @@ ignore = [
 ]
 [tool.mypy]
-python_version = "3.8"
+python_version = "3.9"
 ignore_missing_imports = true
 check_untyped_defs = true

--- a/setup.py
+++ b/setup.py
 import importlib.util
-import io
 import logging
 import os
 import re
@@ -327,7 +326,7 @@ def get_neuronxcc_version():
                                "__init__.py")
    # Check if the command was executed successfully
-    with open(version_file, "rt") as fp:
+    with open(version_file) as fp:
        content = fp.read()
    # Extract the version using a regular expression
@@ -404,7 +403,8 @@ def read_readme() -> str:
    """Read the README file if present."""
    p = get_path("README.md")
    if os.path.isfile(p):
-        return io.open(get_path("README.md"), "r", encoding="utf-8").read()
+        with open(get_path("README.md"), encoding="utf-8") as f:
+            return f.read()
    else:
        return ""
@@ -498,7 +498,6 @@ setup(
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
-        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
@@ -512,7 +511,7 @@ setup(
    ],
    packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
                                    "tests*")),
-    python_requires=">=3.8",
+    python_requires=">=3.9",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    extras_require={

--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -429,8 +429,8 @@ def benchmark():
    # print in tabular format
    print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
    for b in cudagraph_sizes:
-        print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
+        print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
-               f"\t{piecewise_cudagraph_time[b]:.3f}"))
+              f"\t{piecewise_cudagraph_time[b]:.3f}")
 if __name__ == "__main__":

--- a/tests/conftest.py
+++ b/tests/conftest.py
 import json
 import os
-import sys
 import tempfile
 from collections import UserList
 from enum import Enum
@@ -52,7 +51,7 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 def _read_prompts(filename: str) -> List[str]:
-    with open(filename, "r") as f:
+    with open(filename) as f:
        prompts = f.readlines()
        return prompts
@@ -62,13 +61,7 @@ class _ImageAssetPrompts(TypedDict):
    cherry_blossom: str
-if sys.version_info < (3, 9):
+class _ImageAssetsBase(UserList[ImageAsset]):
-    # UserList cannot be subscripted
-    class _ImageAssetsBase(UserList):
-        pass
-else:
-    class _ImageAssetsBase(UserList[ImageAsset]):
    pass
@@ -94,13 +87,7 @@ class _VideoAssetPrompts(TypedDict):
    sample_demo_1: str
-if sys.version_info < (3, 9):
+class _VideoAssetsBase(UserList[VideoAsset]):
-    # UserList cannot be subscripted
-    class _VideoAssetsBase(UserList):
-        pass
-else:
-    class _VideoAssetsBase(UserList[VideoAsset]):
    pass
@@ -958,7 +945,7 @@ def dummy_opt_path():
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyOPTForCausalLM"]
        with open(json_path, "w") as f:
@@ -977,7 +964,7 @@ def dummy_llava_path():
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyLlava"]
        with open(json_path, "w") as f:
@@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path():
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyGemma2Embedding"]
        with open(json_path, "w") as f:

--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -99,13 +99,11 @@ class TestPrefixCachingBlock:
        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
-        first_chain, second_chain = [
+        first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
-            TestPrefixCachingBlock.create_chain(
            block_size=block_size,
            token_ids=token_ids,
            num_empty_trailing_blocks=num_empty_trailing_blocks)
-            for _ in range(2)
+                                     for _ in range(2))
-        ]
        for first_chain_block, second_chain_block in zip(
                first_chain, second_chain):

--- a/tests/kernels/test_mamba_ssm.py
+++ b/tests/kernels/test_mamba_ssm.py
@@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
        for var in (u_ref, delta_ref, B_ref, C_ref, z_ref)
    ]
    for i in range(len(seqlens[0])):
-        u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits]
+        u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits)
        if padded_state_indices[i] == PAD_SLOT_ID:
            continue
        out_ref_s, _ = selective_scan_ref(

--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
@@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
 # Sad path tests for the multimodal input processor and mapper, respectively
 @pytest.mark.parametrize("mm_data", [
    {
-        "image": torch.rand((5))
+        "image": torch.rand(5)
    },
    {
        "image": torch.rand((5, 5, 5, 5, 5))