Add VLLM_OPTEST_URLS_PORT so tests load image URLs from a local HTTP server

41b09879 · zhuwenwen · 4c8e606b · 41b09879 · 41b09879 · 41b09879
Commit 41b09879 authored Nov 29, 2024 by zhuwenwen
9 changed files
--- a/README.md
+++ b/README.md
@@ -8,26 +8,26 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention


 ## 支持模型结构列表
-| 结构 | 模型 | 模型并行 | FP16 |
+| 结构 | 模型 | FP16/BF16 | AWQ | GPTQ |
-| :------: | :------: | :------: | :------: |
+| :------: | :------: | :------: | :------: | :------: |
-| LlamaForCausalLM      | Llama 3.1,Llama 3,Llama 2,Llama,Yi,Codellama,deepseek       | Yes | Yes |  
-| QWenLMHeadModel       | QWen,Qwen-VL                                                | Yes | Yes |
-| Qwen2ForCausalLM      | QWen2,QWen1.5,CodeQwen1.5                                   | Yes | Yes |
-| ChatGLMModel          | glm-4v-9b,chatglm3,chatglm2                                 | Yes | Yes |
-| DeepseekV2ForCausalLM | DeepSeek-V2                                                 | Yes | Yes |
-| BaiChuanForCausalLM   | Baichuan2,Baichuan                                          | Yes | Yes |
-| BloomForCausalLM      | BLOOM                                                       | Yes | Yes |
-| InternLMForCausalLM   | InternLM                                                    | Yes | Yes |
-| InternLM2ForCausalLM  | InternLM2                                                   | Yes | Yes |
-| TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B                        | Yes | Yes |
-| MiniCPMForCausalLM    | MiniCPM                                                     | Yes | Yes |
-| MiniCPM3ForCausalLM   | MiniCPM3                                                    | Yes | Yes |
-| MixtralForCausalLM    | Mixtral-8x7B,Mixtral-8x7B-Instruct                          | Yes | Yes |
-| Qwen2MoeForCausalLM                 | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct        | Yes | Yes |
-| LlavaForConditionalGeneration       | LLaMA,LLaMA-2,LLaMA-3                         | Yes | Yes |
-| Qwen2VLForConditionalGeneration     | Qwen2-VL                                      | Yes | Yes |
-| MiniCPMV                            | MiniCPM-V                                     | Yes | Yes |
-| Phi3VForCausalLM                    | Phi-3.5-vision                                | Yes | Yes |
+| LlamaForCausalLM      | Llama 3.1,Llama 3,Llama 2,Llama,Yi,Codellama,deepseek       | Yes | Yes | Yes |  
+| QWenLMHeadModel       | QWen,Qwen-VL                                                | Yes | Yes | Yes |
+| Qwen2ForCausalLM      | QWen2,QWen1.5,CodeQwen1.5                                   | Yes | Yes | Yes |
+| ChatGLMModel          | glm-4v-9b,chatglm3,chatglm2                                 | Yes | No  | Yes |
+| DeepseekV2ForCausalLM | DeepSeek-V2                                                 | Yes | No  | -   |
+| BaiChuanForCausalLM   | Baichuan2,Baichuan                                          | Yes | Yes | -   |
+| BloomForCausalLM      | BLOOM                                                       | Yes | No  | -   |
+| InternLMForCausalLM   | InternLM                                                    | Yes | No  | -   |
+| InternLM2ForCausalLM  | InternLM2                                                   | Yes | No  | -   |
+| TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B                        | Yes | No  | -   |
+| MiniCPMForCausalLM    | MiniCPM                                                     | Yes | No  | -   |
+| MiniCPM3ForCausalLM   | MiniCPM3                                                    | Yes | No  | -   |
+| MixtralForCausalLM    | Mixtral-8x7B,Mixtral-8x7B-Instruct                          | Yes | No  | -   |
+| Qwen2MoeForCausalLM                 | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct        | Yes | No  | -   |
+| LlavaForConditionalGeneration       | LLaMA,LLaMA-2,LLaMA-3                         | Yes | No  | -   |
+| Qwen2VLForConditionalGeneration     | Qwen2-VL                                      | Yes | No  | Yes |
+| MiniCPMV                            | MiniCPM-V                                     | Yes | No  | -   |
+| Phi3VForCausalLM                    | Phi-3.5-vision                                | Yes | No  | -   |


 ## 安装

--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
 aiohttp

 # quantization
-bitsandbytes>=0.44.0
+# bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8
--- a/tests/entrypoints/openai/test_accuracy.py
+++ b/tests/entrypoints/openai/test_accuracy.py
@@ -11,8 +11,7 @@ import lm_eval
 import pytest
 import os

-from ...utils import RemoteOpenAIServer
-from ...utils import models_path_prefix
+from ...utils import RemoteOpenAIServer, models_path_prefix

 MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
 NUM_CONCURRENT = 500

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -7,7 +7,7 @@ import pytest_asyncio

 from vllm.multimodal.utils import encode_image_base64, fetch_image

-from ...utils import RemoteOpenAIServer, models_path_prefix
+from ...utils import RemoteOpenAIServer, models_path_prefix, urls_port

 MODEL_NAME = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
 MAXIMUM_IMAGES = 2
@@ -22,10 +22,10 @@ MAXIMUM_IMAGES = 2
 #     "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 # ]
 TEST_IMAGE_URLS = [
-    os.path.join(models_path_prefix, "vision/nature_boardwalk.jpg"), 
-    os.path.join(models_path_prefix, "vision/Grayscale_8bits_palette_sample_image.png"),
-    os.path.join(models_path_prefix, "vision/1280px-Venn_diagram_rgb.svg.png"),
-    os.path.join(models_path_prefix, "vision/RGBA_comp.png"),
+    f"http://localhost:{urls_port}/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    f"http://localhost:{urls_port}/Grayscale_8bits_palette_sample_image.png",
+    f"http://localhost:{urls_port}/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    f"http://localhost:{urls_port}/RGBA_comp.png",
 ]



--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -25,7 +25,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader import get_model
-from utils import models_path_prefix
+from ..utils import models_path_prefix


 class ContextIDInfo(TypedDict):

--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -10,13 +10,14 @@ from transformers import AutoConfig, AutoTokenizer

 from vllm.multimodal.utils import (async_fetch_image, fetch_image,
                                   repeat_and_pad_placeholder_tokens)
+from ..utils import urls_port

 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
-    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+    f"http://localhost:{urls_port}/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    f"http://localhost:{urls_port}/Grayscale_8bits_palette_sample_image.png",
+    f"http://localhost:{urls_port}/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    f"http://localhost:{urls_port}/RGBA_comp.png",
 ]



--- a/tests/utils.py
+++ b/tests/utils.py
@@ -30,6 +30,7 @@ import os


 models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_MODELS_PATH")
+urls_port = int(os.getenv('VLLM_OPTEST_URLS_PORT', '8000'))

 if current_platform.is_rocm():
    from amdsmi import (amdsmi_get_gpu_vram_usage,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -5,6 +5,8 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
 if TYPE_CHECKING:
    VLLM_HOST_IP: str = ""
    VLLM_PORT: Optional[int] = None
+    VLLM_OPTEST_URLS_PORT: Optional[int] = None
+    VLLM_OPTEST_MODELS_PATH: str = ""
    VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()
    VLLM_USE_MODELSCOPE: bool = False
    VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
@@ -15,7 +17,6 @@ if TYPE_CHECKING:
    VLLM_USE_OPT_OP: bool = False
    VLLM_USE_TC_PAGED_ATTN: bool = False
    VLLM_USE_PA_PRINT_PARAM: bool = False 
-    VLLM_OPTEST_MODELS_PATH: str = ""
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: Optional[str] = None
    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
@@ -160,6 +161,16 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    'VLLM_PORT':
    lambda: int(os.getenv('VLLM_PORT', '0'))
    if 'VLLM_PORT' in os.environ else None,
+    
+    # Used in the optest environment to manually set the port of the local HTTP server that serves test URLs.
+    'VLLM_OPTEST_URLS_PORT':
+    lambda: int(os.getenv('VLLM_OPTEST_URLS_PORT', '8000'))
+    if 'VLLM_OPTEST_URLS_PORT' in os.environ else None,
+    
+    # Path to the optest models.
+    # If set, will load models from local path instead of Hugging Face Hub.
+    'VLLM_OPTEST_MODELS_PATH':
+    lambda: os.getenv('VLLM_OPTEST_MODELS_PATH', "") or os.getenv("OPTEST_MODELS_PATH", ""),

    # path used for ipc when the frontend api server is running in
    # multi-processing mode to communicate with the backend engine process.
@@ -214,11 +225,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_USE_PA_PRINT_PARAM":
    lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in
             ("true", "1")),
-    
-    # Path to the optest models.
-    # If set, will load models from local path instead of Hugging Face Hub.
-    'VLLM_OPTEST_MODELS_PATH':
-    lambda: os.getenv('VLLM_OPTEST_MODELS_PATH', "") or os.getenv("OPTEST_MODELS_PATH", ""),

    # If set, allowing the use of deprecated beam search implementation
    "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH":

--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -141,11 +141,11 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
    # _ROCM_SWA_REASON,
    "PaliGemmaForConditionalGeneration":
    ("ROCm flash attention does not yet "
-     "fully support 32-bit precision on PaliGemma"),
-    "Phi3VForCausalLM":
-    ("ROCm Triton flash attention may run into compilation errors due to "
-     "excessive use of shared memory. If this happens, disable Triton FA "
-     "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
+     "fully support 32-bit precision on PaliGemma")
+    # "Phi3VForCausalLM":
+    # ("ROCm Triton flash attention may run into compilation errors due to "
+    #  "excessive use of shared memory. If this happens, disable Triton FA "
+    #  "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
 }