Unverified Commit 9a44b643, authored by Lianmin Zheng, committed by GitHub

Fix CI (#9012)

parent 41d71ca4
@@ -30,13 +30,19 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install "vllm==0.9.0"
-          pip install "bitsandbytes>=0.44.0"
+          pip install "vllm==0.10.0"
+          pip install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
           pip install "openai==1.99.1"
+          pip install "bitsandbytes>=0.44.0"
+          # NOTE: The latest sgl-kernel depends on torch 2.8.0 but the latest vllm depends on torch 2.7.0,
+          # so they are not compatible. Here we install the old sgl-kernel to make the test pass.
+          # TODO: remove this once vllm supports torch 2.8.0.
+          pip install "sgl-kernel==0.2.9"

       - name: Run vLLM dependency tests
         timeout-minutes: 60
         run: |
+          export SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK=1
           cd test/srt
           python3 run_suite.py --suite vllm_dependency_test --timeout-per-file 3600
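
The new export SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK=1 pairs with the server-side change below, which reads the flag through get_bool_env_var before asserting the sgl-kernel version. As a rough sketch of what such a boolean env-var reader typically looks like (the real helper lives in sglang.srt.utils and may differ in detail):

import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Treat common truthy spellings as True; anything else counts as False.
    return os.getenv(name, default).strip().lower() in ("true", "1")

With the export in place, the check evaluates to True and the strict sgl-kernel version assertion is bypassed, which is what lets CI install the older sgl-kernel==0.2.9.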
@@ -67,6 +67,7 @@ from sglang.srt.utils import (
     MultiprocessingSerializer,
     assert_pkg_version,
     configure_logger,
+    get_bool_env_var,
     get_zmq_socket,
     is_cuda,
     kill_process_tree,
@@ -627,7 +628,6 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
     if not server_args.enable_symm_mem:
         os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
-    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
@@ -647,7 +647,7 @@ def _set_envs_and_config(server_args: ServerArgs):
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
-    if _is_cuda:
+    if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
        assert_pkg_version(
            "sgl-kernel",
            "0.3.3",
......
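
assert_pkg_version, called above, presumably verifies that the installed distribution is at least the given version and aborts with the supplied hint otherwise. A minimal sketch of that style of check, assuming simple minimum-version semantics (the actual sglang.srt.utils implementation may differ):

from importlib.metadata import PackageNotFoundError, version

from packaging.version import Version

def assert_pkg_version(pkg: str, min_version: str, message: str) -> None:
    # Illustrative only: fail fast if the package is missing or too old.
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        raise RuntimeError(f"{pkg} is not installed. {message}")
    if Version(installed) < Version(min_version):
        raise RuntimeError(
            f"{pkg}=={installed} is older than the required {min_version}. {message}"
        )

Gating this call on the environment variable lets the vLLM dependency test run an intentionally older sgl-kernel without tripping the 0.3.3 assertion.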
@@ -5,16 +5,17 @@ from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
 from typing import Any

+logger = logging.getLogger(__name__)
+
 try:
     from mcp import ClientSession
     from mcp.client.sse import sse_client
     from mcp.types import ListToolsResult
-except ImportError:
-    logger.warning("Ignoring mcp import error")
+except ImportError as e:
+    ClientSession = sse_client = ListToolsResult = e

 from openai_harmony import ToolDescription, ToolNamespaceConfig

-logger = logging.getLogger(__name__)
-

 async def list_server_and_tools(server_url: str):
......
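
Binding the missing names to the caught ImportError turns a silent warning into a deferred, loud failure: the module still imports when mcp is absent, but the first attempt to use one of the names raises (calling sse_client(...), for example, fails with a TypeError because the stored exception object is not callable). A condensed sketch of the pattern, reduced to a single optional name:

import logging

logger = logging.getLogger(__name__)

try:
    from mcp import ClientSession  # optional dependency
except ImportError as e:
    # Defer the failure to first use instead of warning at import time.
    ClientSession = e

Moving logger above the try block also fixes a latent bug: previously the except branch referenced logger before the module-level assignment had run, so importing without mcp would have raised a NameError.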
@@ -147,6 +147,7 @@ class FusedMoE(torch.nn.Module):
         self.layer_id = layer_id
         self.top_k = top_k
+        self.hidden_size = hidden_size
         self.num_experts = num_experts
         self.num_fused_shared_experts = num_fused_shared_experts
         self.expert_map_cpu = None
......
@@ -26,8 +26,9 @@ try:
     from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig

     VLLM_AVAILABLE = True
-except ImportError:
+except ImportError as e:
     VLLM_AVAILABLE = False
+    VLLM_IMPORT_ERROR = e

     # Define empty classes as placeholders when vllm is not available
     class DummyConfig:
@@ -137,7 +138,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.9.0.1`"
+            f"Please install vllm by `pip install vllm==0.9.0.1`\n"
+            f"Import error: {VLLM_IMPORT_ERROR}"
         )
     return QUANTIZATION_METHODS[quantization]
......
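
Recording the original exception and echoing it in the ValueError makes the failure actionable: vllm can fail to import for reasons other than being absent (an incompatible torch build, for instance), and the message now surfaces the root cause. A condensed sketch of the pattern; require_vllm is a hypothetical name used here only for illustration:

try:
    import vllm  # heavyweight optional dependency
    VLLM_AVAILABLE = True
    VLLM_IMPORT_ERROR = None
except ImportError as e:
    VLLM_AVAILABLE = False
    VLLM_IMPORT_ERROR = e

def require_vllm(feature: str) -> None:
    # Hypothetical helper mirroring the check in get_quantization_config.
    if not VLLM_AVAILABLE:
        raise ValueError(
            f"{feature} requires some operators from vllm. "
            f"Please install vllm.\nImport error: {VLLM_IMPORT_ERROR}"
        )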
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
 from __future__ import annotations

-import importlib.util
 import logging
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import torch
 from torch.nn.parameter import Parameter
@@ -42,11 +41,7 @@ if is_cuda():
     try:
         from flashinfer import mm_fp4 as fp4_gemm
-        from flashinfer import (
-            reorder_rows_for_gated_act_gemm,
-            shuffle_matrix_a,
-            shuffle_matrix_sf_a,
-        )
+        from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_sf_a

         enable_flashinfer_fp4_gemm = True
     except ImportError:
......
@@ -20,7 +20,7 @@ def import_processors():
         try:
             module = importlib.import_module(name)
         except Exception as e:
-            logger.warning(f"Ignore import error when loading {name}: " f"{e}")
+            logger.warning(f"Ignore import error when loading {name}: {e}")
             continue
         all_members = inspect.getmembers(module, inspect.isclass)
         classes = [
......
@@ -83,7 +83,7 @@ def import_model_classes():
         try:
             module = importlib.import_module(name)
         except Exception as e:
-            logger.warning(f"Ignore import error when loading {name}. " f"{e}")
+            logger.warning(f"Ignore import error when loading {name}: {e}")
             continue
         if hasattr(module, "EntryClass"):
             entry = module.EntryClass
......
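
The two warning calls above simply merge a needlessly split pair of f-strings into one. As an aside (not part of this commit), logging's %-style arguments defer the string build until the record is actually emitted, which matters when a level may be filtered out:

import logging

logger = logging.getLogger(__name__)
name, err = "sglang.srt.models.llama", ImportError("missing dep")  # illustrative values

logger.warning(f"Ignore import error when loading {name}: {err}")     # formatted eagerly
logger.warning("Ignore import error when loading %s: %s", name, err)  # formatted lazily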
@@ -83,7 +83,6 @@ class TestUtilsUpdateWeights(unittest.TestCase):
         # Set up environment variables
         os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
         os.environ["NCCL_CUMEM_ENABLE"] = "0"
-        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
         os.environ["CUDA_MODULE_LOADING"] = "AUTO"
......