Commit 41b09879 authored by zhuwenwen
Browse files

Add VLLM_OPTEST_URLS_PORT so tests load image URLs from a local HTTP server

parent 4c8e606b
...@@ -8,26 +8,26 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PagedAttention ...@@ -8,26 +8,26 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PagedAttention
## 支持模型结构列表 ## 支持模型结构列表
| 结构 | 模型 | 模型并行 | FP16 | | 结构 | 模型 | FP16/BF16 | AWQ | GPTQ |
| :------: | :------: | :------: | :------: | | :------: | :------: | :------: | :------: | :------: |
| LlamaForCausalLM | Llama 3.1,Llama 3,Llama 2,Llama,Yi,Codellama,deepseek | Yes | Yes | | LlamaForCausalLM | Llama 3.1,Llama 3,Llama 2,Llama,Yi,Codellama,deepseek | Yes | Yes | Yes |
| QWenLMHeadModel | QWen,Qwen-VL | Yes | Yes | | QWenLMHeadModel | QWen,Qwen-VL | Yes | Yes | Yes |
| Qwen2ForCausalLM | QWen2,QWen1.5,CodeQwen1.5 | Yes | Yes | | Qwen2ForCausalLM | QWen2,QWen1.5,CodeQwen1.5 | Yes | Yes | Yes |
| ChatGLMModel | glm-4v-9b,chatglm3,chatglm2 | Yes | Yes | | ChatGLMModel | glm-4v-9b,chatglm3,chatglm2 | Yes | No | Yes |
| DeepseekV2ForCausalLM | DeepSeek-V2 | Yes | Yes | | DeepseekV2ForCausalLM | DeepSeek-V2 | Yes | No | - |
| BaiChuanForCausalLM | Baichuan2,Baichuan | Yes | Yes | | BaiChuanForCausalLM | Baichuan2,Baichuan | Yes | Yes | - |
| BloomForCausalLM | BLOOM | Yes | Yes | | BloomForCausalLM | BLOOM | Yes | No | - |
| InternLMForCausalLM | InternLM | Yes | Yes | | InternLMForCausalLM | InternLM | Yes | No | - |
| InternLM2ForCausalLM | InternLM2 | Yes | Yes | | InternLM2ForCausalLM | InternLM2 | Yes | No | - |
| TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B | Yes | Yes | | TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B | Yes | No | - |
| MiniCPMForCausalLM | MiniCPM | Yes | Yes | | MiniCPMForCausalLM | MiniCPM | Yes | No | - |
| MiniCPM3ForCausalLM | MiniCPM3 | Yes | Yes | | MiniCPM3ForCausalLM | MiniCPM3 | Yes | No | - |
| MixtralForCausalLM | Mixtral-8x7B,Mixtral-8x7B-Instruct | Yes | Yes | | MixtralForCausalLM | Mixtral-8x7B,Mixtral-8x7B-Instruct | Yes | No | - |
| Qwen2MoeForCausalLM | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct | Yes | Yes | | Qwen2MoeForCausalLM | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct | Yes | No | - |
| LlavaForConditionalGeneration | LLaMA,LLaMA-2,LLaMA-3 | Yes | Yes | | LlavaForConditionalGeneration | LLaMA,LLaMA-2,LLaMA-3 | Yes | No | - |
| Qwen2VLForConditionalGeneration | Qwen2-VL | Yes | Yes | | Qwen2VLForConditionalGeneration | Qwen2-VL | Yes | No | Yes |
| MiniCPMV | MiniCPM-V | Yes | Yes | | MiniCPMV | MiniCPM-V | Yes | No | - |
| Phi3VForCausalLM | Phi-3.5-vision | Yes | Yes | | Phi3VForCausalLM | Phi-3.5-vision | Yes | No | - |
## 安装 ## 安装
......
...@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test ...@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
aiohttp aiohttp
# quantization # quantization
bitsandbytes>=0.44.0 # bitsandbytes>=0.44.0
buildkite-test-collector==0.1.8 buildkite-test-collector==0.1.8
...@@ -11,8 +11,7 @@ import lm_eval ...@@ -11,8 +11,7 @@ import lm_eval
import pytest import pytest
import os import os
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer, models_path_prefix
from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct") MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
NUM_CONCURRENT = 500 NUM_CONCURRENT = 500
......
...@@ -7,7 +7,7 @@ import pytest_asyncio ...@@ -7,7 +7,7 @@ import pytest_asyncio
from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.multimodal.utils import encode_image_base64, fetch_image
from ...utils import RemoteOpenAIServer, models_path_prefix from ...utils import RemoteOpenAIServer, models_path_prefix, urls_port
MODEL_NAME = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct") MODEL_NAME = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
MAXIMUM_IMAGES = 2 MAXIMUM_IMAGES = 2
...@@ -22,10 +22,10 @@ MAXIMUM_IMAGES = 2 ...@@ -22,10 +22,10 @@ MAXIMUM_IMAGES = 2
# "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
# ] # ]
TEST_IMAGE_URLS = [ TEST_IMAGE_URLS = [
os.path.join(models_path_prefix, "vision/nature_boardwalk.jpg"), f"http://localhost:{urls_port}/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
os.path.join(models_path_prefix, "vision/Grayscale_8bits_palette_sample_image.png"), f"http://localhost:{urls_port}/Grayscale_8bits_palette_sample_image.png",
os.path.join(models_path_prefix, "vision/1280px-Venn_diagram_rgb.svg.png"), f"http://localhost:{urls_port}/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
os.path.join(models_path_prefix, "vision/RGBA_comp.png"), f"http://localhost:{urls_port}/RGBA_comp.png",
] ]
......
...@@ -25,7 +25,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor ...@@ -25,7 +25,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from utils import models_path_prefix from ..utils import models_path_prefix
class ContextIDInfo(TypedDict): class ContextIDInfo(TypedDict):
......
...@@ -10,13 +10,14 @@ from transformers import AutoConfig, AutoTokenizer ...@@ -10,13 +10,14 @@ from transformers import AutoConfig, AutoTokenizer
from vllm.multimodal.utils import (async_fetch_image, fetch_image, from vllm.multimodal.utils import (async_fetch_image, fetch_image,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from ..utils import urls_port
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [ TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", f"http://localhost:{urls_port}/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", f"http://localhost:{urls_port}/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", f"http://localhost:{urls_port}/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", f"http://localhost:{urls_port}/RGBA_comp.png",
] ]
......
...@@ -30,6 +30,7 @@ import os ...@@ -30,6 +30,7 @@ import os
models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_MODELS_PATH") models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_MODELS_PATH")
urls_port = int(os.getenv('VLLM_OPTEST_URLS_PORT', '8000'))
if current_platform.is_rocm(): if current_platform.is_rocm():
from amdsmi import (amdsmi_get_gpu_vram_usage, from amdsmi import (amdsmi_get_gpu_vram_usage,
......
...@@ -5,6 +5,8 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional ...@@ -5,6 +5,8 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
if TYPE_CHECKING: if TYPE_CHECKING:
VLLM_HOST_IP: str = "" VLLM_HOST_IP: str = ""
VLLM_PORT: Optional[int] = None VLLM_PORT: Optional[int] = None
VLLM_OPTEST_URLS_PORT: Optional[int] = None
VLLM_OPTEST_MODELS_PATH: str = ""
VLLM_RPC_BASE_PATH: str = tempfile.gettempdir() VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()
VLLM_USE_MODELSCOPE: bool = False VLLM_USE_MODELSCOPE: bool = False
VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
...@@ -15,7 +17,6 @@ if TYPE_CHECKING: ...@@ -15,7 +17,6 @@ if TYPE_CHECKING:
VLLM_USE_OPT_OP: bool = False VLLM_USE_OPT_OP: bool = False
VLLM_USE_TC_PAGED_ATTN: bool = False VLLM_USE_TC_PAGED_ATTN: bool = False
VLLM_USE_PA_PRINT_PARAM: bool = False VLLM_USE_PA_PRINT_PARAM: bool = False
VLLM_OPTEST_MODELS_PATH: str = ""
LOCAL_RANK: int = 0 LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None CUDA_VISIBLE_DEVICES: Optional[str] = None
VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
...@@ -160,6 +161,16 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -160,6 +161,16 @@ environment_variables: Dict[str, Callable[[], Any]] = {
'VLLM_PORT': 'VLLM_PORT':
lambda: int(os.getenv('VLLM_PORT', '0')) lambda: int(os.getenv('VLLM_PORT', '0'))
if 'VLLM_PORT' in os.environ else None, if 'VLLM_PORT' in os.environ else None,
# Used in the optest environment to set the port of the local HTTP server (http://localhost:<port>) that serves test image URLs.
'VLLM_OPTEST_URLS_PORT':
lambda: int(os.getenv('VLLM_OPTEST_URLS_PORT', '8000'))
if 'VLLM_OPTEST_URLS_PORT' in os.environ else None,
# Path to the optest models.
# If set, will load models from local path instead of Hugging Face Hub.
'VLLM_OPTEST_MODELS_PATH':
lambda: os.getenv('VLLM_OPTEST_MODELS_PATH', "") or os.getenv("OPTEST_MODELS_PATH", ""),
# path used for ipc when the frontend api server is running in # path used for ipc when the frontend api server is running in
# multi-processing mode to communicate with the backend engine process. # multi-processing mode to communicate with the backend engine process.
...@@ -214,11 +225,6 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -214,11 +225,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_USE_PA_PRINT_PARAM": "VLLM_USE_PA_PRINT_PARAM":
lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in
("true", "1")), ("true", "1")),
# Path to the optest models.
# If set, will load models from local path instead of Hugging Face Hub.
'VLLM_OPTEST_MODELS_PATH':
lambda: os.getenv('VLLM_OPTEST_MODELS_PATH', "") or os.getenv("OPTEST_MODELS_PATH", ""),
# If set, allowing the use of deprecated beam search implementation # If set, allowing the use of deprecated beam search implementation
"VLLM_ALLOW_DEPRECATED_BEAM_SEARCH": "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH":
......
...@@ -141,11 +141,11 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { ...@@ -141,11 +141,11 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
# _ROCM_SWA_REASON, # _ROCM_SWA_REASON,
"PaliGemmaForConditionalGeneration": "PaliGemmaForConditionalGeneration":
("ROCm flash attention does not yet " ("ROCm flash attention does not yet "
"fully support 32-bit precision on PaliGemma"), "fully support 32-bit precision on PaliGemma")
"Phi3VForCausalLM": # "Phi3VForCausalLM":
("ROCm Triton flash attention may run into compilation errors due to " # ("ROCm Triton flash attention may run into compilation errors due to "
"excessive use of shared memory. If this happens, disable Triton FA " # "excessive use of shared memory. If this happens, disable Triton FA "
"by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") # "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment