Unverified commit 21063c11, authored by Aaron Pham, committed by GitHub
Browse files

[CI/Build] drop support for Python 3.8 EOL (#8464)


Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
parent 4be3a451
...@@ -56,7 +56,7 @@ serving_column_mapping = { ...@@ -56,7 +56,7 @@ serving_column_mapping = {
def read_markdown(file): def read_markdown(file):
if os.path.exists(file): if os.path.exists(file):
with open(file, "r") as f: with open(file) as f:
return f.read() + "\n" return f.read() + "\n"
else: else:
return f"{file} not found.\n" return f"{file} not found.\n"
...@@ -75,14 +75,14 @@ if __name__ == "__main__": ...@@ -75,14 +75,14 @@ if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
if "serving" in str(test_file): if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py` # this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f: with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read()) command = json.loads(f.read())
raw_result.update(command) raw_result.update(command)
...@@ -97,7 +97,7 @@ if __name__ == "__main__": ...@@ -97,7 +97,7 @@ if __name__ == "__main__":
# this result is generated via `benchmark_latency.py` # this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f: with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read()) command = json.loads(f.read())
raw_result.update(command) raw_result.update(command)
...@@ -119,7 +119,7 @@ if __name__ == "__main__": ...@@ -119,7 +119,7 @@ if __name__ == "__main__":
# this result is generated via `benchmark_throughput.py` # this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f: with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read()) command = json.loads(f.read())
raw_result.update(command) raw_result.update(command)
......
...@@ -72,7 +72,7 @@ def main(args): ...@@ -72,7 +72,7 @@ def main(args):
# collect results # collect results
for test_file in results_folder.glob("*_nightly_results.json"): for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f: with open(test_file) as f:
results = results + json.loads(f.read()) results = results + json.loads(f.read())
# generate markdown table # generate markdown table
...@@ -80,7 +80,7 @@ def main(args): ...@@ -80,7 +80,7 @@ def main(args):
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f: with open(args.description) as f:
description = f.read() description = f.read()
description = description.format( description = description.format(
......
...@@ -36,11 +36,11 @@ if __name__ == "__main__": ...@@ -36,11 +36,11 @@ if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f: with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read()) command = json.loads(f.read())
raw_result.update(command) raw_result.update(command)
......
...@@ -25,7 +25,7 @@ jobs: ...@@ -25,7 +25,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] python-version: ["3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
......
...@@ -48,7 +48,7 @@ jobs: ...@@ -48,7 +48,7 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: ['ubuntu-20.04'] os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] python-version: ['3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1'] cuda-version: ['11.8', '12.1']
......
...@@ -29,19 +29,19 @@ jobs: ...@@ -29,19 +29,19 @@ jobs:
matrix: matrix:
python-version: ["3.12"] python-version: ["3.12"]
steps: steps:
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install -r requirements-lint.txt pip install -r requirements-lint.txt
- name: Analysing the code with ruff - name: Analysing the code with ruff
run: | run: |
echo "::add-matcher::.github/workflows/matchers/ruff.json" echo "::add-matcher::.github/workflows/matchers/ruff.json"
ruff check --output-format github . ruff check --output-format github .
- name: Run isort - name: Run isort
run: | run: |
isort . --check-only isort . --check-only
...@@ -23,16 +23,16 @@ jobs: ...@@ -23,16 +23,16 @@ jobs:
matrix: matrix:
python-version: ["3.12"] python-version: ["3.12"]
steps: steps:
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install yapf==0.32.0 pip install yapf==0.32.0
pip install toml==0.10.2 pip install toml==0.10.2
- name: Running yapf - name: Running yapf
run: | run: |
yapf --diff --recursive . yapf --diff --recursive .
...@@ -6,17 +6,16 @@ version: 2 ...@@ -6,17 +6,16 @@ version: 2
build: build:
os: ubuntu-22.04 os: ubuntu-22.04
tools: tools:
python: "3.8" python: '3.9'
sphinx: sphinx:
configuration: docs/source/conf.py configuration: docs/source/conf.py
fail_on_warning: true fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF # If using Sphinx, optionally build your docs in additional formats such as PDF
formats: [] formats: []
# Optionally declare the Python requirements required to build your docs # Optionally declare the Python requirements required to build your docs
python: python:
install: install:
- requirements: docs/requirements-docs.txt - requirements: docs/requirements-docs.txt
...@@ -128,9 +128,9 @@ endif() ...@@ -128,9 +128,9 @@ endif()
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
# #
# For cuda we want to be able to control which architectures we compile for on # For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract # a per-file basis in order to cut down on compile time. So here we extract
# the set of architectures we want to compile for and remove them from the # the set of architectures we want to compile for and remove them from the
# CMAKE_CUDA_FLAGS so that they are not applied globally. # CMAKE_CUDA_FLAGS so that they are not applied globally.
# #
clear_cuda_arches(CUDA_ARCH_FLAGS) clear_cuda_arches(CUDA_ARCH_FLAGS)
...@@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported archs # Filter the target architectures by the supported archs
# since for some files we will build for all CUDA_ARCHS. # since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else() else()
...@@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# are not supported by Machete yet. # are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
set(MARLIN_SRCS set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu" "csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
...@@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures") "in CUDA target architectures")
endif() endif()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels # build any 3x kernels
set(SCALED_MM_3X_ARCHS) set(SCALED_MM_3X_ARCHS)
endif() endif()
...@@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# #
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x. # kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x # subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
...@@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
# #
# For the Machete kernels we automatically generate sources for various # For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules. # preselected input type pairs and schedules.
# Generate sources: # Generate sources:
set(MACHETE_GEN_SCRIPT set(MACHETE_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
...@@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process( execute_process(
COMMAND ${CMAKE_COMMAND} -E env COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output OUTPUT_VARIABLE machete_generation_output
...@@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if (NOT machete_generation_result EQUAL 0) if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed." message(FATAL_ERROR "Machete generation failed."
" Result: \"${machete_generation_result}\"" " Result: \"${machete_generation_result}\""
"\nCheck the log for details: " "\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else() else()
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
CACHE STRING "Last run machete generate script hash" FORCE) CACHE STRING "Last run machete generate script hash" FORCE)
message(STATUS "Machete generation completed successfully.") message(STATUS "Machete generation completed successfully.")
endif() endif()
...@@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else() else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS) AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is " message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or " "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
...@@ -392,8 +392,8 @@ define_gpu_extension_target( ...@@ -392,8 +392,8 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses # If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA. # driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly. # Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
...@@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") ...@@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return() return()
endif () endif ()
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target # vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the # arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis) # arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here. # we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS}) foreach(_ARCH ${CUDA_ARCHS})
......
...@@ -79,7 +79,7 @@ async def async_request_tgi( ...@@ -79,7 +79,7 @@ async def async_request_tgi(
# any data, we should skip it. # any data, we should skip it.
if chunk_bytes.startswith(":"): if chunk_bytes.startswith(":"):
continue continue
chunk = remove_prefix(chunk_bytes, "data:") chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk) data = json.loads(chunk)
timestamp = time.perf_counter() timestamp = time.perf_counter()
...@@ -144,8 +144,8 @@ async def async_request_trt_llm( ...@@ -144,8 +144,8 @@ async def async_request_trt_llm(
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk = remove_prefix(chunk_bytes.decode("utf-8"), chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:") "data:")
data = json.loads(chunk) data = json.loads(chunk)
output.generated_text += data["text_output"] output.generated_text += data["text_output"]
...@@ -261,8 +261,8 @@ async def async_request_openai_completions( ...@@ -261,8 +261,8 @@ async def async_request_openai_completions(
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk = remove_prefix(chunk_bytes.decode("utf-8"), chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ") "data: ")
if chunk == "[DONE]": if chunk == "[DONE]":
latency = time.perf_counter() - st latency = time.perf_counter() - st
else: else:
...@@ -349,8 +349,8 @@ async def async_request_openai_chat_completions( ...@@ -349,8 +349,8 @@ async def async_request_openai_chat_completions(
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk = remove_prefix(chunk_bytes.decode("utf-8"), chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ") "data: ")
if chunk == "[DONE]": if chunk == "[DONE]":
latency = time.perf_counter() - st latency = time.perf_counter() - st
else: else:
...@@ -389,14 +389,6 @@ async def async_request_openai_chat_completions( ...@@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
return output return output
# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
if text.startswith(prefix):
return text[len(prefix):]
return text
def get_model(pretrained_model_name_or_path: str) -> str: def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download from modelscope import snapshot_download
......
...@@ -269,10 +269,10 @@ def run_square_bench(args): ...@@ -269,10 +269,10 @@ def run_square_bench(args):
def run_range_bench(args): def run_range_bench(args):
m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")] m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")] m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
m_increment, k_increment, n_increment = \ m_increment, k_increment, n_increment = \
[int(x) for x in args.dim_increment.split(",")] (int(x) for x in args.dim_increment.split(","))
Ms = list(range(m_start, m_end + 1, m_increment)) Ms = list(range(m_start, m_end + 1, m_increment))
Ks = list(range(k_start, k_end + 1, k_increment)) Ks = list(range(k_start, k_end + 1, k_increment))
Ns = list(range(n_start, n_end + 1, n_increment)) Ns = list(range(n_start, n_end + 1, n_increment))
......
...@@ -468,7 +468,7 @@ def generate(): ...@@ -468,7 +468,7 @@ def generate():
impl_configs = [] impl_configs = []
GPTQ_kernel_type_configs = list( GPTQ_kernel_type_configs = list(
(TypeConfig( TypeConfig(
element_a=element_a, element_a=element_a,
element_b=element_b, element_b=element_b,
element_b_scale=element_a, element_b_scale=element_a,
...@@ -476,7 +476,7 @@ def generate(): ...@@ -476,7 +476,7 @@ def generate():
element_d=element_a, element_d=element_a,
accumulator=DataType.f32, accumulator=DataType.f32,
) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128) ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
for element_a in (DataType.f16, DataType.bf16))) for element_a in (DataType.f16, DataType.bf16))
GPTQ_kernel_specializations = [ GPTQ_kernel_specializations = [
Specialization(with_C=False, with_zeropoints=False, with_scales=True) Specialization(with_C=False, with_zeropoints=False, with_scales=True)
...@@ -490,7 +490,7 @@ def generate(): ...@@ -490,7 +490,7 @@ def generate():
] ]
AWQ_kernel_type_configs = list( AWQ_kernel_type_configs = list(
(TypeConfig( TypeConfig(
element_a=element_a, element_a=element_a,
element_b=element_b, element_b=element_b,
element_b_scale=element_a, element_b_scale=element_a,
...@@ -498,7 +498,7 @@ def generate(): ...@@ -498,7 +498,7 @@ def generate():
element_d=element_a, element_d=element_a,
accumulator=DataType.f32, accumulator=DataType.f32,
) for element_b in (DataType.u4, DataType.u8) ) for element_b in (DataType.u4, DataType.u8)
for element_a in (DataType.f16, DataType.bf16))) for element_a in (DataType.f16, DataType.bf16))
AWQ_kernel_specializations = [ AWQ_kernel_specializations = [
Specialization(with_C=False, with_zeropoints=True, with_scales=True) Specialization(with_C=False, with_zeropoints=True, with_scales=True)
......
...@@ -10,7 +10,7 @@ Requirements ...@@ -10,7 +10,7 @@ Requirements
============ ============
* OS: Linux * OS: Linux
* Python: 3.8 - 3.12 * Python: 3.9 -- 3.12
* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
Install released versions Install released versions
...@@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T ...@@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
.. tip:: .. tip::
Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` . For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
...@@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example ...@@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example
$ export MAX_JOBS=6 $ export MAX_JOBS=6
$ pip install -e . $ pip install -e .
This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
A side effect is a much slower build process. A side effect is a much slower build process.
Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
...@@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: ...@@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
Unsupported OS build Unsupported OS build
-------------------- --------------------
vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing: Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing:
......
...@@ -34,7 +34,7 @@ select = [ ...@@ -34,7 +34,7 @@ select = [
# Pyflakes # Pyflakes
"F", "F",
# pyupgrade # pyupgrade
# "UP", "UP",
# flake8-bugbear # flake8-bugbear
"B", "B",
# flake8-simplify # flake8-simplify
...@@ -55,7 +55,7 @@ ignore = [ ...@@ -55,7 +55,7 @@ ignore = [
] ]
[tool.mypy] [tool.mypy]
python_version = "3.8" python_version = "3.9"
ignore_missing_imports = true ignore_missing_imports = true
check_untyped_defs = true check_untyped_defs = true
......
import importlib.util import importlib.util
import io
import logging import logging
import os import os
import re import re
...@@ -327,7 +326,7 @@ def get_neuronxcc_version(): ...@@ -327,7 +326,7 @@ def get_neuronxcc_version():
"__init__.py") "__init__.py")
# Check if the command was executed successfully # Check if the command was executed successfully
with open(version_file, "rt") as fp: with open(version_file) as fp:
content = fp.read() content = fp.read()
# Extract the version using a regular expression # Extract the version using a regular expression
...@@ -404,7 +403,8 @@ def read_readme() -> str: ...@@ -404,7 +403,8 @@ def read_readme() -> str:
"""Read the README file if present.""" """Read the README file if present."""
p = get_path("README.md") p = get_path("README.md")
if os.path.isfile(p): if os.path.isfile(p):
return io.open(get_path("README.md"), "r", encoding="utf-8").read() with open(get_path("README.md"), encoding="utf-8") as f:
return f.read()
else: else:
return "" return ""
...@@ -498,7 +498,6 @@ setup( ...@@ -498,7 +498,6 @@ setup(
"Documentation": "https://vllm.readthedocs.io/en/latest/", "Documentation": "https://vllm.readthedocs.io/en/latest/",
}, },
classifiers=[ classifiers=[
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.11",
...@@ -512,7 +511,7 @@ setup( ...@@ -512,7 +511,7 @@ setup(
], ],
packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
"tests*")), "tests*")),
python_requires=">=3.8", python_requires=">=3.9",
install_requires=get_requirements(), install_requires=get_requirements(),
ext_modules=ext_modules, ext_modules=ext_modules,
extras_require={ extras_require={
......
...@@ -429,8 +429,8 @@ def benchmark(): ...@@ -429,8 +429,8 @@ def benchmark():
# print in tabular format # print in tabular format
print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph") print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
for b in cudagraph_sizes: for b in cudagraph_sizes:
print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
f"\t{piecewise_cudagraph_time[b]:.3f}")) f"\t{piecewise_cudagraph_time[b]:.3f}")
if __name__ == "__main__": if __name__ == "__main__":
......
import json import json
import os import os
import sys
import tempfile import tempfile
from collections import UserList from collections import UserList
from enum import Enum from enum import Enum
...@@ -52,7 +51,7 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray] ...@@ -52,7 +51,7 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray]
def _read_prompts(filename: str) -> List[str]: def _read_prompts(filename: str) -> List[str]:
with open(filename, "r") as f: with open(filename) as f:
prompts = f.readlines() prompts = f.readlines()
return prompts return prompts
...@@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict): ...@@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict):
cherry_blossom: str cherry_blossom: str
if sys.version_info < (3, 9): class _ImageAssetsBase(UserList[ImageAsset]):
# UserList cannot be subscripted pass
class _ImageAssetsBase(UserList):
pass
else:
class _ImageAssetsBase(UserList[ImageAsset]):
pass
class _ImageAssets(_ImageAssetsBase): class _ImageAssets(_ImageAssetsBase):
...@@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict): ...@@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict):
sample_demo_1: str sample_demo_1: str
if sys.version_info < (3, 9): class _VideoAssetsBase(UserList[VideoAsset]):
# UserList cannot be subscripted pass
class _VideoAssetsBase(UserList):
pass
else:
class _VideoAssetsBase(UserList[VideoAsset]):
pass
class _VideoAssets(_VideoAssetsBase): class _VideoAssets(_VideoAssetsBase):
...@@ -958,7 +945,7 @@ def dummy_opt_path(): ...@@ -958,7 +945,7 @@ def dummy_opt_path():
"*.msgpack" "*.msgpack"
]) ])
assert os.path.exists(json_path) assert os.path.exists(json_path)
with open(json_path, "r") as f: with open(json_path) as f:
config = json.load(f) config = json.load(f)
config["architectures"] = ["MyOPTForCausalLM"] config["architectures"] = ["MyOPTForCausalLM"]
with open(json_path, "w") as f: with open(json_path, "w") as f:
...@@ -977,7 +964,7 @@ def dummy_llava_path(): ...@@ -977,7 +964,7 @@ def dummy_llava_path():
"*.msgpack" "*.msgpack"
]) ])
assert os.path.exists(json_path) assert os.path.exists(json_path)
with open(json_path, "r") as f: with open(json_path) as f:
config = json.load(f) config = json.load(f)
config["architectures"] = ["MyLlava"] config["architectures"] = ["MyLlava"]
with open(json_path, "w") as f: with open(json_path, "w") as f:
...@@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path(): ...@@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path():
"*.msgpack" "*.msgpack"
]) ])
assert os.path.exists(json_path) assert os.path.exists(json_path)
with open(json_path, "r") as f: with open(json_path) as f:
config = json.load(f) config = json.load(f)
config["architectures"] = ["MyGemma2Embedding"] config["architectures"] = ["MyGemma2Embedding"]
with open(json_path, "w") as f: with open(json_path, "w") as f:
......
...@@ -99,13 +99,11 @@ class TestPrefixCachingBlock: ...@@ -99,13 +99,11 @@ class TestPrefixCachingBlock:
token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)] token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
first_chain, second_chain = [ first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
TestPrefixCachingBlock.create_chain( block_size=block_size,
block_size=block_size, token_ids=token_ids,
token_ids=token_ids, num_empty_trailing_blocks=num_empty_trailing_blocks)
num_empty_trailing_blocks=num_empty_trailing_blocks) for _ in range(2))
for _ in range(2)
]
for first_chain_block, second_chain_block in zip( for first_chain_block, second_chain_block in zip(
first_chain, second_chain): first_chain, second_chain):
......
...@@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, ...@@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
for var in (u_ref, delta_ref, B_ref, C_ref, z_ref) for var in (u_ref, delta_ref, B_ref, C_ref, z_ref)
] ]
for i in range(len(seqlens[0])): for i in range(len(seqlens[0])):
u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits] u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits)
if padded_state_indices[i] == PAD_SLOT_ID: if padded_state_indices[i] == PAD_SLOT_ID:
continue continue
out_ref_s, _ = selective_scan_ref( out_ref_s, _ = selective_scan_ref(
......
...@@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, ...@@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
# Sad path tests for the multimodal input processor and mapper, respectively # Sad path tests for the multimodal input processor and mapper, respectively
@pytest.mark.parametrize("mm_data", [ @pytest.mark.parametrize("mm_data", [
{ {
"image": torch.rand((5)) "image": torch.rand(5)
}, },
{ {
"image": torch.rand((5, 5, 5, 5, 5)) "image": torch.rand((5, 5, 5, 5, 5))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment