[CI/Build] drop support for Python 3.8 EOL (#8464)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

[CI/Build] drop support for Python 3.8 EOL (#8464)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
21063c11 · Aaron Pham · GitHub · 4be3a451 · 21063c11 · 21063c11
Unverified commit 21063c11, authored Nov 06, 2024 by Aaron Pham; committed by GitHub on Nov 06, 2024.
20 changed files
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -56,7 +56,7 @@ serving_column_mapping = {
 def read_markdown(file):
    if os.path.exists(file):
-        with open(file, "r") as f:
+        with open(file) as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"
@@ -75,14 +75,14 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            raw_result = json.loads(f.read())
        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)
@@ -97,7 +97,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_latency.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)
@@ -119,7 +119,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_throughput.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)

--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -72,7 +72,7 @@ def main(args):
    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            results = results + json.loads(f.read())
    # generate markdown table
@@ -80,7 +80,7 @@ def main(args):
    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
-    with open(args.description, "r") as f:
+    with open(args.description) as f:
        description = f.read()
    description = description.format(

--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -36,11 +36,11 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            raw_result = json.loads(f.read())
        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
        raw_result.update(command)

--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -25,7 +25,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
    steps:
    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
    - name: Set up Python ${{ matrix.python-version }}

--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -48,7 +48,7 @@ jobs:
      fail-fast: false
      matrix:
          os: ['ubuntu-20.04']
-          python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+          python-version: ['3.9', '3.10', '3.11', '3.12']
          pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
          cuda-version: ['11.8', '12.1']

--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -29,19 +29,19 @@ jobs:
      matrix:
        python-version: ["3.12"]
    steps:
-    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-    - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-      with:
+        with:
-        python-version: ${{ matrix.python-version }}
+          python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
+      - name: Install dependencies
-      run: |
+        run: |
-        python -m pip install --upgrade pip
+          python -m pip install --upgrade pip
-        pip install -r requirements-lint.txt
+          pip install -r requirements-lint.txt
-    - name: Analysing the code with ruff
+      - name: Analysing the code with ruff
-      run: |
+        run: |
-        echo "::add-matcher::.github/workflows/matchers/ruff.json"
+          echo "::add-matcher::.github/workflows/matchers/ruff.json"
-        ruff check --output-format github .
+          ruff check --output-format github .
-    - name: Run isort
+      - name: Run isort
-      run: |
+        run: |
-        isort . --check-only
+          isort . --check-only
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -23,16 +23,16 @@ jobs:
      matrix:
        python-version: ["3.12"]
    steps:
-    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-    - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-      with:
+        with:
-        python-version: ${{ matrix.python-version }}
+          python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
+      - name: Install dependencies
-      run: |
+        run: |
-        python -m pip install --upgrade pip
+          python -m pip install --upgrade pip
-        pip install yapf==0.32.0
+          pip install yapf==0.32.0
-        pip install toml==0.10.2
+          pip install toml==0.10.2
-    - name: Running yapf
+      - name: Running yapf
-      run: |
+        run: |
-        yapf --diff --recursive .
+          yapf --diff --recursive .
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -6,17 +6,16 @@ version: 2
 build:
  os: ubuntu-22.04
  tools:
-    python: "3.8"
+    python: '3.9'
 sphinx:
-   configuration: docs/source/conf.py
+  configuration: docs/source/conf.py
-   fail_on_warning: true
+  fail_on_warning: true
 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats: []
 # Optionally declare the Python requirements required to build your docs
 python:
-   install:
+  install:
-   - requirements: docs/requirements-docs.txt
+    - requirements: docs/requirements-docs.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,9 +128,9 @@ endif()
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
-  # For cuda we want to be able to control which architectures we compile for on 
+  # For cuda we want to be able to control which architectures we compile for on
  # a per-file basis in order to cut down on compile time. So here we extract
-  # the set of architectures we want to compile for and remove the from the 
+  # the set of architectures we want to compile for and remove them from the
  # CMAKE_CUDA_FLAGS so that they are not applied globally.
  #
  clear_cuda_arches(CUDA_ARCH_FLAGS)
@@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
  # Filter the target architectures by the supported supported archs
  # since for some files we will build for all CUDA_ARCHS.
-  cuda_archs_loose_intersection(CUDA_ARCHS 
+  cuda_archs_loose_intersection(CUDA_ARCHS
    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
 else()
@@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # are not supported by Machete yet.
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
  if (MARLIN_ARCHS)
-    set(MARLIN_SRCS 
+    set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
@@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                     "in CUDA target architectures")
    endif()
-    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't 
+    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
    # build any 3x kernels
    set(SCALED_MM_3X_ARCHS)
  endif()
@@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
-  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS 
+  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
    "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
@@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
    #
-    # For the Machete kernels we automatically generate sources for various 
+    # For the Machete kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
-    set(MACHETE_GEN_SCRIPT 
+    set(MACHETE_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
@@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
      execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env 
+        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH 
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
          ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
        RESULT_VARIABLE machete_generation_result
        OUTPUT_VARIABLE machete_generation_output
@@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      if (NOT machete_generation_result EQUAL 0)
        message(FATAL_ERROR "Machete generation failed."
-                            " Result: \"${machete_generation_result}\"" 
+                            " Result: \"${machete_generation_result}\""
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
      else()
-        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} 
+        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
            CACHE STRING "Last run machete generate script hash" FORCE)
        message(STATUS "Machete generation completed successfully.")
      endif()
@@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
        AND MACHETE_ARCHS)
      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -392,8 +392,8 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)
-# If CUTLASS is compiled on NVCC >= 12.5, it by default uses 
+# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
-# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the 
+# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
 # driver API. This causes problems when linking with earlier versions of CUDA.
 # Setting this variable sidesteps the issue by calling the driver directly.
 target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
  return()
 endif ()
-# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target  
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
-# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the 
+# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
-# arches in the CUDA case (and instead set the gencodes on a per file basis) 
+# arches in the CUDA case (and instead set the gencodes on a per file basis)
 # we need to manually set VLLM_GPU_ARCHES here.
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  foreach(_ARCH ${CUDA_ARCHS})

--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -79,7 +79,7 @@ async def async_request_tgi(
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
-                        chunk = remove_prefix(chunk_bytes, "data:")
+                        chunk = chunk_bytes.removeprefix("data:")
                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
@@ -144,8 +144,8 @@ async def async_request_trt_llm(
                        if not chunk_bytes:
                            continue
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                              "data:")
+                            "data:")
                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
@@ -261,8 +261,8 @@ async def async_request_openai_completions(
                        if not chunk_bytes:
                            continue
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                              "data: ")
+                            "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
@@ -349,8 +349,8 @@ async def async_request_openai_chat_completions(
                        if not chunk_bytes:
                            continue
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                              "data: ")
+                            "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
@@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
    return output
-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix):]
-    return text
 def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -269,10 +269,10 @@ def run_square_bench(args):
 def run_range_bench(args):
-    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
+    m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
-    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
    m_increment, k_increment, n_increment = \
-        [int(x) for x in args.dim_increment.split(",")]
+        (int(x) for x in args.dim_increment.split(","))
    Ms = list(range(m_start, m_end + 1, m_increment))
    Ks = list(range(k_start, k_end + 1, k_increment))
    Ns = list(range(n_start, n_end + 1, n_increment))

--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -468,7 +468,7 @@ def generate():
    impl_configs = []
    GPTQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
            element_a=element_a,
            element_b=element_b,
            element_b_scale=element_a,
@@ -476,7 +476,7 @@ def generate():
            element_d=element_a,
            accumulator=DataType.f32,
        ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-         for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))
    GPTQ_kernel_specializations = [
        Specialization(with_C=False, with_zeropoints=False, with_scales=True)
@@ -490,7 +490,7 @@ def generate():
    ]
    AWQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
            element_a=element_a,
            element_b=element_b,
            element_b_scale=element_a,
@@ -498,7 +498,7 @@ def generate():
            element_d=element_a,
            accumulator=DataType.f32,
        ) for element_b in (DataType.u4, DataType.u8)
-         for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))
    AWQ_kernel_specializations = [
        Specialization(with_C=False, with_zeropoints=True, with_scales=True)

--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -10,7 +10,7 @@ Requirements
 ============
 * OS: Linux
-* Python: 3.8 - 3.12
+* Python: 3.9 -- 3.12
 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 Install released versions
@@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
 .. tip::
    Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
-    For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` . 
+    For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache``.
    As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
@@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example
    $ export MAX_JOBS=6
    $ pip install -e .
-This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. 
+This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
-A side effect is a much slower build process. 
+A side effect is a much slower build process.
 Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
@@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
 Unsupported OS build
 --------------------
-vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. 
+vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
 Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing:

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ select = [
    # Pyflakes
    "F",
    # pyupgrade
-    # "UP",
+    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
@@ -55,7 +55,7 @@ ignore = [
 ]
 [tool.mypy]
-python_version = "3.8"
+python_version = "3.9"
 ignore_missing_imports = true
 check_untyped_defs = true

--- a/setup.py
+++ b/setup.py
 import importlib.util
-import io
 import logging
 import os
 import re
@@ -327,7 +326,7 @@ def get_neuronxcc_version():
                                "__init__.py")
    # Check if the command was executed successfully
-    with open(version_file, "rt") as fp:
+    with open(version_file) as fp:
        content = fp.read()
    # Extract the version using a regular expression
@@ -404,7 +403,8 @@ def read_readme() -> str:
    """Read the README file if present."""
    p = get_path("README.md")
    if os.path.isfile(p):
-        return io.open(get_path("README.md"), "r", encoding="utf-8").read()
+        with open(get_path("README.md"), encoding="utf-8") as f:
+            return f.read()
    else:
        return ""
@@ -498,7 +498,6 @@ setup(
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
-        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
@@ -512,7 +511,7 @@ setup(
    ],
    packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
                                    "tests*")),
-    python_requires=">=3.8",
+    python_requires=">=3.9",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    extras_require={

--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -429,8 +429,8 @@ def benchmark():
    # print in tabular format
    print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
    for b in cudagraph_sizes:
-        print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
+        print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
-               f"\t{piecewise_cudagraph_time[b]:.3f}"))
+              f"\t{piecewise_cudagraph_time[b]:.3f}")
 if __name__ == "__main__":

--- a/tests/conftest.py
+++ b/tests/conftest.py
 import json
 import os
-import sys
 import tempfile
 from collections import UserList
 from enum import Enum
@@ -52,7 +51,7 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 def _read_prompts(filename: str) -> List[str]:
-    with open(filename, "r") as f:
+    with open(filename) as f:
        prompts = f.readlines()
        return prompts
@@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict):
    cherry_blossom: str
-if sys.version_info < (3, 9):
+class _ImageAssetsBase(UserList[ImageAsset]):
-    # UserList cannot be subscripted
+    pass
-    class _ImageAssetsBase(UserList):
-        pass
-else:
-    class _ImageAssetsBase(UserList[ImageAsset]):
-        pass
 class _ImageAssets(_ImageAssetsBase):
@@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict):
    sample_demo_1: str
-if sys.version_info < (3, 9):
+class _VideoAssetsBase(UserList[VideoAsset]):
-    # UserList cannot be subscripted
+    pass
-    class _VideoAssetsBase(UserList):
-        pass
-else:
-    class _VideoAssetsBase(UserList[VideoAsset]):
-        pass
 class _VideoAssets(_VideoAssetsBase):
@@ -958,7 +945,7 @@ def dummy_opt_path():
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyOPTForCausalLM"]
        with open(json_path, "w") as f:
@@ -977,7 +964,7 @@ def dummy_llava_path():
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyLlava"]
        with open(json_path, "w") as f:
@@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path():
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyGemma2Embedding"]
        with open(json_path, "w") as f:

--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -99,13 +99,11 @@ class TestPrefixCachingBlock:
        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
-        first_chain, second_chain = [
+        first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
-            TestPrefixCachingBlock.create_chain(
+            block_size=block_size,
-                block_size=block_size,
+            token_ids=token_ids,
-                token_ids=token_ids,
+            num_empty_trailing_blocks=num_empty_trailing_blocks)
-                num_empty_trailing_blocks=num_empty_trailing_blocks)
+                                     for _ in range(2))
-            for _ in range(2)
-        ]
        for first_chain_block, second_chain_block in zip(
                first_chain, second_chain):

--- a/tests/kernels/test_mamba_ssm.py
+++ b/tests/kernels/test_mamba_ssm.py
@@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
        for var in (u_ref, delta_ref, B_ref, C_ref, z_ref)
    ]
    for i in range(len(seqlens[0])):
-        u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits]
+        u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits)
        if padded_state_indices[i] == PAD_SLOT_ID:
            continue
        out_ref_s, _ = selective_scan_ref(

--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
@@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
 # Sad path tests for the multimodal input processor and mapper, respectively
 @pytest.mark.parametrize("mm_data", [
    {
-        "image": torch.rand((5))
+        "image": torch.rand(5)
    },
    {
        "image": torch.rand((5, 5, 5, 5, 5))