Unverified Commit bf3e271f authored by Lianmin Zheng, committed by GitHub

Update vllm to v0.4.3 (#511)


Co-authored-by: Qubitium <417764+Qubitium@users.noreply.github.com>
Co-authored-by: ZX <zx@lbx.dev>
parent 3bc01ac1
@@ -20,7 +20,7 @@ dependencies = [
 [project.optional-dependencies]
 srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-       "zmq", "vllm==0.4.2", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
+       "zmq", "vllm==0.4.3", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
 openai = ["openai>=1.0", "numpy", "tiktoken"]
 anthropic = ["anthropic>=0.20.0", "numpy"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
...
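The only dependency change is the vllm pin (0.4.2 to 0.4.3). As a hedged aside, bringing an already-installed environment in line with the new pin is a one-liner; the editable reinstall is only needed when working from a source checkout:

    pip install "vllm==0.4.3"
    pip install -e ".[all]"   # from an sglang checkout, re-resolves the srt extras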
@@ -11,7 +11,7 @@ import torch
 import torch.nn as nn
 from vllm.config import DeviceConfig, LoadConfig
 from vllm.config import ModelConfig as VllmModelConfig
-from vllm.distributed import initialize_model_parallel
+from vllm.distributed import initialize_model_parallel, init_distributed_environment
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import ModelRegistry
@@ -240,11 +240,11 @@ class ModelRunner:
         logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.")
         torch.cuda.set_device(self.gpu_id)
         logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
-        torch.distributed.init_process_group(
+        init_distributed_environment(
             backend="nccl",
             world_size=self.tp_size,
             rank=self.tp_rank,
-            init_method=f"tcp://127.0.0.1:{self.nccl_port}",
+            distributed_init_method=f"tcp://127.0.0.1:{self.nccl_port}",
         )
         initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
         total_gpu_memory = get_available_gpu_memory(
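Context for the hunk above: in vLLM 0.4.3 process-group setup moves behind vllm.distributed.init_distributed_environment, which wraps torch.distributed.init_process_group and also records world size and rank in vLLM's internal parallel state, so the later initialize_model_parallel call must see state created by it rather than by a raw torch init. A minimal single-process sketch of the new sequence (the port value is an arbitrary free one, not taken from this diff):

    from vllm.distributed import (
        init_distributed_environment,
        initialize_model_parallel,
    )

    # Single GPU, tp_size == 1; mirrors the ModelRunner call above.
    init_distributed_environment(
        backend="nccl",
        world_size=1,
        rank=0,
        distributed_init_method="tcp://127.0.0.1:28765",  # any free local port
    )
    initialize_model_parallel(tensor_model_parallel_size=1)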
@@ -291,6 +291,7 @@ class ModelRunner:
             vision_language_config=None,
             parallel_config=None,
             scheduler_config=None,
+            cache_config=None,
         )
         logger.info(
             f"[gpu_id={self.gpu_id}] Load weight end. "
...
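The new cache_config=None keyword mirrors vLLM 0.4.3's model loader, whose get_model(...) now threads a CacheConfig through to every model constructor. That is why each model file below gains the same import and __init__ keyword even though sglang's models do not use it. A hedged sketch of the recurring pattern (MyModelForCausalLM is a hypothetical class, not from this diff):

    from typing import Optional

    import torch.nn as nn
    from vllm.config import CacheConfig
    from vllm.model_executor.layers.quantization.base_config import QuantizationConfig

    class MyModelForCausalLM(nn.Module):  # hypothetical example class
        def __init__(
            self,
            config,  # a transformers PretrainedConfig
            quant_config: Optional[QuantizationConfig] = None,
            cache_config: Optional[CacheConfig] = None,  # accepted for loader compatibility, unused
        ) -> None:
            super().__init__()
            self.config = config
            self.quant_config = quant_config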
@@ -30,6 +30,7 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn.parameter import Parameter
 from transformers import PretrainedConfig
+from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -304,6 +305,7 @@ class CohereForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
...
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Tuple
 import torch
 import torch.nn as nn
+from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -352,6 +353,7 @@ class DbrxForCausalLM(nn.Module):
         self,
         config: DbrxConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ):
         super().__init__()
         self.config = config
...
@@ -6,7 +6,7 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import LoRAConfig
+from vllm.config import LoRAConfig, CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import GeluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -264,6 +264,7 @@ class GemmaForCausalLM(nn.Module):
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         del lora_config  # Unused.
         super().__init__()
...
@@ -11,6 +11,7 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm import _custom_ops as ops
+from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -538,6 +539,7 @@ class Grok1ModelForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
...
@@ -7,6 +7,7 @@ import torch
 import tqdm
 from torch import nn
 from transformers import LlamaConfig
+from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size
@@ -258,6 +259,7 @@ class LlamaForCausalLM(nn.Module):
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
...
@@ -7,6 +7,7 @@ import torch
 from torch import nn
 from transformers import CLIPVisionModel, CLIPVisionConfig, LlavaConfig, Qwen2Config, MistralConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -27,6 +28,7 @@ class LlavaLlamaForCausalLM(nn.Module):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -294,8 +296,9 @@ class LlavaQwenForCausalLM(LlavaLlamaForCausalLM):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
-        super().__init__(config, quant_config=quant_config)
+        super().__init__(config, quant_config=quant_config, cache_config=cache_config)
         self.config = config
         self.vision_tower = None
         if getattr(self.config, "vision_config", None) is None:
@@ -356,8 +359,9 @@ class LlavaMistralForCausalLM(LlavaLlamaForCausalLM):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
-        super().__init__(config, quant_config=quant_config)
+        super().__init__(config, quant_config=quant_config, cache_config=cache_config)
         self.config = config
         self.vision_tower = None
         if getattr(self.config, "vision_config", None) is None:
...
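One subtlety in the Llava variants above: subclasses that override __init__ must forward the new keyword explicitly, otherwise the loader-supplied cache_config is silently dropped at the subclass boundary. The pattern, sketched with a hypothetical subclass:

    from sglang.srt.models.llava import LlavaLlamaForCausalLM

    class MyLlavaVariant(LlavaLlamaForCausalLM):  # hypothetical subclass
        def __init__(self, config, quant_config=None, cache_config=None) -> None:
            # Forward by keyword so the parent signature can evolve safely.
            super().__init__(config, quant_config=quant_config, cache_config=cache_config)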
@@ -7,6 +7,7 @@ import torch
 from torch import nn
 from transformers import CLIPVisionModel, LlavaConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -25,6 +26,7 @@ class LlavaVidForCausalLM(nn.Module):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
...
@@ -9,6 +9,7 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import MixtralConfig
 from vllm import _custom_ops as ops
+from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -414,6 +415,7 @@ class MixtralForCausalLM(nn.Module):
         self,
         config: MixtralConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
...
@@ -8,6 +8,7 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import MixtralConfig
+from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -313,6 +314,7 @@ class QuantMixtralForCausalLM(nn.Module):
         self,
         config: MixtralConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
...
@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Iterable, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
+from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -227,6 +228,7 @@ class QWenLMHeadModel(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ):
         super().__init__()
         self.config = config
...
@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Tuple, Iterable
 import torch
 from torch import nn
+from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -251,6 +252,7 @@ class Qwen2ForCausalLM(nn.Module):
         self,
         config: Qwen2Config,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
...
@@ -7,6 +7,7 @@ from typing import Optional, Tuple, Iterable
 import torch
 from torch import nn
 from transformers import PretrainedConfig
+from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.linear import (
@@ -225,6 +226,7 @@ class StableLmForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
...
"""Inference-only Yi-VL model.""" """Inference-only Yi-VL model."""
from typing import Tuple, Iterable from typing import Tuple, Iterable, Optional
import torch import torch
import torch.nn as nn import torch.nn as nn
from transformers import CLIPVisionModel, LlavaConfig from transformers import CLIPVisionModel, LlavaConfig
from vllm.config import CacheConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from sglang.srt.models.llava import ( from sglang.srt.models.llava import (
LlavaLlamaForCausalLM, LlavaLlamaForCausalLM,
monkey_path_clip_vision_embed_forward, monkey_path_clip_vision_embed_forward,
...@@ -15,9 +17,12 @@ from sglang.srt.models.llava import ( ...@@ -15,9 +17,12 @@ from sglang.srt.models.llava import (
class YiVLForCausalLM(LlavaLlamaForCausalLM): class YiVLForCausalLM(LlavaLlamaForCausalLM):
def __init__( def __init__(
self, config, quant_config = None, self,
config: LlavaConfig,
quant_config: Optional[QuantizationConfig] = None,
cache_config: Optional[CacheConfig] = None,
) -> None: ) -> None:
super().__init__(config, quant_config) super().__init__(config, quant_config, cache_config)
self.multi_modal_projector = YiVLMultiModalProjector(self.config) self.multi_modal_projector = YiVLMultiModalProjector(self.config)
self.vision_tower_subfolder = self.config.mm_vision_tower.replace( self.vision_tower_subfolder = self.config.mm_vision_tower.replace(
......
@@ -421,9 +421,10 @@ def suppress_other_loggers():
     from vllm.logger import logger as vllm_default_logger

     vllm_default_logger.setLevel(logging.WARN)
-    logging.getLogger("vllm.utils").setLevel(logging.WARN)
-    logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.config").setLevel(logging.ERROR)
+    logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(logging.WARN)
+    logging.getLogger("vllm.selector").setLevel(logging.WARN)
+    logging.getLogger("vllm.utils").setLevel(logging.WARN)

 def assert_pkg_version(pkg: str, min_version: str):
...
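Besides reordering the existing suppressions alphabetically, the hunk above silences the pynccl communicator logger that vLLM 0.4.3 introduced under vllm.distributed.device_communicators. For reference, the helper after this change reads roughly as follows (the function body outside the diffed lines is assumed unchanged):

    import logging

    def suppress_other_loggers():
        from vllm.logger import logger as vllm_default_logger

        # Keep vLLM's own loggers quiet so sglang's logs stay readable.
        vllm_default_logger.setLevel(logging.WARN)
        logging.getLogger("vllm.config").setLevel(logging.ERROR)
        logging.getLogger(
            "vllm.distributed.device_communicators.pynccl"
        ).setLevel(logging.WARN)
        logging.getLogger("vllm.selector").setLevel(logging.WARN)
        logging.getLogger("vllm.utils").setLevel(logging.WARN)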