Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
41b09879
Commit
41b09879
authored
Nov 29, 2024
by
zhuwenwen
Browse files
add VLLM_OPTEST_URLS_PORT to load https from local
parent
4c8e606b
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
50 additions
and
43 deletions
+50
-43
README.md
README.md
+19
-19
requirements-test.txt
requirements-test.txt
+1
-1
tests/entrypoints/openai/test_accuracy.py
tests/entrypoints/openai/test_accuracy.py
+1
-2
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+5
-5
tests/lora/conftest.py
tests/lora/conftest.py
+1
-1
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+5
-4
tests/utils.py
tests/utils.py
+1
-0
vllm/envs.py
vllm/envs.py
+12
-6
vllm/model_executor/models/__init__.py
vllm/model_executor/models/__init__.py
+5
-5
No files found.
README.md
View file @
41b09879
...
...
@@ -8,26 +8,26 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
## 支持模型结构列表
| 结构 | 模型 |
模型并行 | FP16
|
| 结构 | 模型 |
FP16/BF16 | AWQ | GPTQ
|
| :------: | :------: | :------: | :------: |
| LlamaForCausalLM | Llama 3.1,Llama 3,Llama 2,Llama,Yi,Codellama,deepseek | Yes | Yes |
| QWenLMHeadModel | QWen,Qwen-VL | Yes | Yes |
| Qwen2ForCausalLM | QWen2,QWen1.5,CodeQwen1.5 | Yes | Yes |
| ChatGLMModel | glm-4v-9b,chatglm3,chatglm2 | Yes | Yes |
| DeepseekV2ForCausalLM | DeepSeek-V2 | Yes |
Yes
|
| BaiChuanForCausalLM | Baichuan2,Baichuan | Yes | Yes |
| BloomForCausalLM | BLOOM | Yes |
Yes
|
| InternLMForCausalLM | InternLM | Yes |
Yes
|
| InternLM2ForCausalLM | InternLM2 | Yes |
Yes
|
| TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B | Yes |
Yes
|
| MiniCPMForCausalLM | MiniCPM | Yes |
Yes
|
| MiniCPM3ForCausalLM | MiniCPM3 | Yes |
Yes
|
| MixtralForCausalLM | Mixtral-8x7B,Mixtral-8x7B-Instruct | Yes |
Yes
|
| Qwen2MoeForCausalLM | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct | Yes |
Yes
|
| LlavaForConditionalGeneration | LLaMA,LLaMA-2,LLaMA-3 | Yes |
Yes
|
| Qwen2VLForConditionalGeneration | Qwen2-VL | Yes | Yes |
| MiniCPMV | MiniCPM-V | Yes |
Yes
|
| Phi3VForCausalLM | Phi-3.5-vision | Yes |
Yes
|
| LlamaForCausalLM | Llama 3.1,Llama 3,Llama 2,Llama,Yi,Codellama,deepseek | Yes | Yes |
Yes |
| QWenLMHeadModel | QWen,Qwen-VL | Yes | Yes |
Yes |
| Qwen2ForCausalLM | QWen2,QWen1.5,CodeQwen1.5 | Yes | Yes |
Yes |
| ChatGLMModel | glm-4v-9b,chatglm3,chatglm2 | Yes |
No |
Yes |
| DeepseekV2ForCausalLM | DeepSeek-V2 | Yes |
No | -
|
| BaiChuanForCausalLM | Baichuan2,Baichuan | Yes | Yes |
- |
| BloomForCausalLM | BLOOM | Yes |
No | -
|
| InternLMForCausalLM | InternLM | Yes |
No | -
|
| InternLM2ForCausalLM | InternLM2 | Yes |
No | -
|
| TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B | Yes |
No | -
|
| MiniCPMForCausalLM | MiniCPM | Yes |
No | -
|
| MiniCPM3ForCausalLM | MiniCPM3 | Yes |
No | -
|
| MixtralForCausalLM | Mixtral-8x7B,Mixtral-8x7B-Instruct | Yes |
No | -
|
| Qwen2MoeForCausalLM | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct | Yes |
No | -
|
| LlavaForConditionalGeneration | LLaMA,LLaMA-2,LLaMA-3 | Yes |
No | -
|
| Qwen2VLForConditionalGeneration | Qwen2-VL | Yes |
No |
Yes |
| MiniCPMV | MiniCPM-V | Yes |
No | -
|
| Phi3VForCausalLM | Phi-3.5-vision | Yes |
No | -
|
## 安装
...
...
requirements-test.txt
View file @
41b09879
...
...
@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
aiohttp
# quantization
bitsandbytes>=0.44.0
#
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.8
tests/entrypoints/openai/test_accuracy.py
View file @
41b09879
...
...
@@ -11,8 +11,7 @@ import lm_eval
import
pytest
import
os
from
...utils
import
RemoteOpenAIServer
from
...utils
import
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
NUM_CONCURRENT
=
500
...
...
tests/entrypoints/openai/test_vision.py
View file @
41b09879
...
...
@@ -7,7 +7,7 @@ import pytest_asyncio
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
,
urls_port
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)
MAXIMUM_IMAGES
=
2
...
...
@@ -22,10 +22,10 @@ MAXIMUM_IMAGES = 2
# "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
# ]
TEST_IMAGE_URLS
=
[
os
.
path
.
join
(
models_path_prefix
,
"vision/
nature
_
boardwalk.jpg"
),
os
.
path
.
join
(
models_path_prefix
,
"vision
/Grayscale_8bits_palette_sample_image.png"
)
,
os
.
path
.
join
(
models_path_prefix
,
"vision
/1280px-Venn_diagram_rgb.svg.png"
)
,
os
.
path
.
join
(
models_path_prefix
,
"vision
/RGBA_comp.png"
)
,
f
"http://localhost:
{
urls_port
}
/2560px-Gfp-wisconsin-madison-the-
nature
-
boardwalk.jpg"
,
f
"http://localhost:
{
urls_port
}
/Grayscale_8bits_palette_sample_image.png"
,
f
"http://localhost:
{
urls_port
}
/Venn_diagram_rgb.svg
/1280px-Venn_diagram_rgb.svg.png"
,
f
"http://localhost:
{
urls_port
}
/RGBA_comp.png"
,
]
...
...
tests/lora/conftest.py
View file @
41b09879
...
...
@@ -25,7 +25,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader
import
get_model
from
utils
import
models_path_prefix
from
..
utils
import
models_path_prefix
class
ContextIDInfo
(
TypedDict
):
...
...
tests/multimodal/test_utils.py
View file @
41b09879
...
...
@@ -10,13 +10,14 @@ from transformers import AutoConfig, AutoTokenizer
from
vllm.multimodal.utils
import
(
async_fetch_image
,
fetch_image
,
repeat_and_pad_placeholder_tokens
)
from
..utils
import
urls_port
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS
=
[
"http
s
://
upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg
/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
,
"http
s
://
upload.wikimedia.org/wikipedia/commons/f/fa
/Grayscale_8bits_palette_sample_image.png"
,
"http
s
://
upload.wikimedia.org/wikipedia/commons/thumb/9/91
/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png"
,
"http
s
://
upload.wikimedia.org/wikipedia/commons/0/0b
/RGBA_comp.png"
,
f
"http://
localhost:
{
urls_port
}
/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
,
f
"http://
localhost:
{
urls_port
}
/Grayscale_8bits_palette_sample_image.png"
,
f
"http://
localhost:
{
urls_port
}
/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png"
,
f
"http://
localhost:
{
urls_port
}
/RGBA_comp.png"
,
]
...
...
tests/utils.py
View file @
41b09879
...
...
@@ -30,6 +30,7 @@ import os
models_path_prefix
=
os
.
getenv
(
'VLLM_OPTEST_MODELS_PATH'
)
or
os
.
getenv
(
"OPTEST_MODELS_PATH"
)
urls_port
=
int
(
os
.
getenv
(
'VLLM_OPTEST_URLS_PORT'
,
'8000'
))
if
current_platform
.
is_rocm
():
from
amdsmi
import
(
amdsmi_get_gpu_vram_usage
,
...
...
vllm/envs.py
View file @
41b09879
...
...
@@ -5,6 +5,8 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
if
TYPE_CHECKING
:
VLLM_HOST_IP
:
str
=
""
VLLM_PORT
:
Optional
[
int
]
=
None
VLLM_OPTEST_URLS_PORT
:
Optional
[
int
]
=
None
VLLM_OPTEST_MODELS_PATH
:
str
=
""
VLLM_RPC_BASE_PATH
:
str
=
tempfile
.
gettempdir
()
VLLM_USE_MODELSCOPE
:
bool
=
False
VLLM_RINGBUFFER_WARNING_INTERVAL
:
int
=
60
...
...
@@ -15,7 +17,6 @@ if TYPE_CHECKING:
VLLM_USE_OPT_OP
:
bool
=
False
VLLM_USE_TC_PAGED_ATTN
:
bool
=
False
VLLM_USE_PA_PRINT_PARAM
:
bool
=
False
VLLM_OPTEST_MODELS_PATH
:
str
=
""
LOCAL_RANK
:
int
=
0
CUDA_VISIBLE_DEVICES
:
Optional
[
str
]
=
None
VLLM_ENGINE_ITERATION_TIMEOUT_S
:
int
=
60
...
...
@@ -160,6 +161,16 @@ environment_variables: Dict[str, Callable[[], Any]] = {
'VLLM_PORT'
:
lambda
:
int
(
os
.
getenv
(
'VLLM_PORT'
,
'0'
))
if
'VLLM_PORT'
in
os
.
environ
else
None
,
# used in optest environment to manually set the port of the local HTTP server that serves test image URLs
'VLLM_OPTEST_URLS_PORT'
:
lambda
:
int
(
os
.
getenv
(
'VLLM_OPTEST_URLS_PORT'
,
'8000'
))
if
'VLLM_OPTEST_URLS_PORT'
in
os
.
environ
else
None
,
# Path to the optest models.
# If set, will load models from local path instead of Hugging Face Hub.
'VLLM_OPTEST_MODELS_PATH'
:
lambda
:
os
.
getenv
(
'VLLM_OPTEST_MODELS_PATH'
,
""
)
or
os
.
getenv
(
"OPTEST_MODELS_PATH"
,
""
),
# path used for ipc when the frontend api server is running in
# multi-processing mode to communicate with the backend engine process.
...
...
@@ -214,11 +225,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_USE_PA_PRINT_PARAM"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PA_PRINT_PARAM"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# Path to the optest models.
# If set, will load models from local path instead of Hugging Face Hub.
'VLLM_OPTEST_MODELS_PATH'
:
lambda
:
os
.
getenv
(
'VLLM_OPTEST_MODELS_PATH'
,
""
)
or
os
.
getenv
(
"OPTEST_MODELS_PATH"
,
""
),
# If set, allowing the use of deprecated beam search implementation
"VLLM_ALLOW_DEPRECATED_BEAM_SEARCH"
:
...
...
vllm/model_executor/models/__init__.py
View file @
41b09879
...
...
@@ -141,11 +141,11 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
# _ROCM_SWA_REASON,
"PaliGemmaForConditionalGeneration"
:
(
"ROCm flash attention does not yet "
"fully support 32-bit precision on PaliGemma"
)
,
"Phi3VForCausalLM"
:
(
"ROCm Triton flash attention may run into compilation errors due to "
"excessive use of shared memory. If this happens, disable Triton FA "
"by setting `VLLM_USE_TRITON_FLASH_ATTN=0`"
)
"fully support 32-bit precision on PaliGemma"
)
#
"Phi3VForCausalLM":
#
("ROCm Triton flash attention may run into compilation errors due to "
#
"excessive use of shared memory. If this happens, disable Triton FA "
#
"by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment