Unverified commit 2c7d0a5b authored by Theresa Barton, committed by GitHub

[Fix] Fix all the Huggingface paths (#1553)

parent 8cdc76f6
@@ -81,7 +81,7 @@ docker run --gpus all \
   --env "HF_TOKEN=<secret>" \
   --ipc=host \
   lmsysorg/sglang:latest \
-  python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+  python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
```

### Method 4: Using docker compose
...
@@ -121,7 +121,7 @@ resources:
run: |
  conda deactivate
  python3 -m sglang.launch_server \
-    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model-path meta-llama/Llama-3.1-8B-Instruct \
    --host 0.0.0.0 \
    --port 30000
```
...
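Once the container (or SkyPilot task) above is running, the renamed path can be smoke-tested directly. A minimal sketch, assuming port 30000 is published on localhost:

```bash
# List the served model ids via the OpenAI-compatible endpoint
curl -s http://localhost:30000/v1/models
# Send one short request to SGLang's native /generate endpoint
curl -s http://localhost:30000/generate \
  -H "Content-Type: application/json" \
  -d '{"text": "The capital of France is", "sampling_params": {"max_new_tokens": 16}}'
```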
@@ -58,12 +58,12 @@ We referred to the reproduction method in https://github.com/vllm-project/vllm/i
```bash
# Llama 3.1 8B Instruct on 1 x A100
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
# Llama 3.1 70B Instruct on 4 x H100
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --disable-radix-cache --tp 4
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
# bench serving
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 1200 --request-rate 4
...
@@ -76,12 +76,12 @@ python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-pro
```bash
# Llama 3.1 8B Instruct on 1 x A100
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
# Llama 3.1 70B Instruct on 4 x H100
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
# bench serving
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 5000
...
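For scale: at `--request-rate 4`, the 1200 prompts above arrive over roughly 1200 / 4 = 300 seconds of sustained load, while the 5000-prompt run omits `--request-rate` to measure peak throughput. The same client can also be pointed at the vLLM server, as the truncated hunk context above shows; a sketch, where `--port 8000` (vLLM's default) is an assumption about bench_serving's flags:

```bash
# Benchmark the vLLM server launched above instead of SGLang
python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 1200 --request-rate 4 --port 8000
```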
@@ -27,10 +27,10 @@ export HF_TOKEN=hf_token
```bash
# Meta-Llama-3.1-8B-Instruct
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
# Meta-Llama-3.1-70B-Instruct
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --disable-radix-cache --tp 8
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 8
# Meta-Llama-3-70B-Instruct-FP8
python -m sglang.launch_server --model-path neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-radix-cache --tp 8
...
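The meta-llama repos are gated, so the `HF_TOKEN` exported in the hunk context must belong to an account with access. To confirm the renamed path resolves before launching a server, the weights can be pre-fetched; a sketch, assuming the `huggingface_hub` CLI is installed:

```bash
pip install -U "huggingface_hub[cli]"
# Download the renamed repo into the local HF cache, skipping the original/* consolidated weights
huggingface-cli download meta-llama/Llama-3.1-8B-Instruct --exclude "original/*"
```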
@@ -17,7 +17,7 @@ services:
      # - SGLANG_USE_MODELSCOPE: true
    entrypoint: python3 -m sglang.launch_server
    command:
-      --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+      --model-path meta-llama/Llama-3.1-8B-Instruct
      --host 0.0.0.0
      --port 30000
    ulimits:
...
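Usage is the standard compose workflow; a sketch, assuming the service file is saved as `compose.yaml` (filename hypothetical) and port 30000 is published:

```bash
docker compose up -d
# The server answers on /health once the model has finished loading
curl -s http://localhost:30000/health
```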
@@ -32,7 +32,7 @@ spec:
        ports:
        - containerPort: 30000
        command: ["python3", "-m", "sglang.launch_server"]
-        args: ["--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
+        args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
        env:
        - name: HF_TOKEN
          value: <secret>
...
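A sketch of applying and reaching this deployment; the manifest filename and resource name are hypothetical:

```bash
kubectl apply -f sglang-deployment.yaml
# Forward the container port locally and smoke-test the OpenAI-compatible endpoint
kubectl port-forward deployment/sglang 30000:30000 &
curl -s http://localhost:30000/v1/models
```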
@@ -30,7 +30,7 @@ apt install nsight-systems-cli
```bash
# server
# set the delay and duration times according to needs
-nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache
+nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
# client
python3 -m sglang.bench_serving --backend sglang --num-prompts 6000 --dataset-name random --random-input 4096 --random-output 2048
...
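Here `--delay 60` skips the first minute (model load) and `--duration 70` captures the next 70 s of serving under the client load. Afterwards the capture can be summarized on the CLI or opened in the Nsight Systems GUI; a sketch, assuming a recent nsys that appends `.nsys-rep` to the `-o` name:

```bash
# Text summary of CUDA kernel and API time in the capture
nsys stats sglang.out.nsys-rep
```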
@@ -35,7 +35,7 @@ docker run --gpus all \
   --env "HF_TOKEN=<secret>" \
   --ipc=host \
   lmsysorg/sglang:latest \
-  python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+  python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
```

### Method 4: Using docker compose
...
@@ -75,7 +75,7 @@ resources:
run: |
  conda deactivate
  python3 -m sglang.launch_server \
-    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model-path meta-llama/Llama-3.1-8B-Instruct \
    --host 0.0.0.0 \
    --port 30000
```
...
""" """
Usage: Usage:
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000
python openai_chat.py python openai_chat.py
""" """
...@@ -10,7 +10,7 @@ from openai import OpenAI ...@@ -10,7 +10,7 @@ from openai import OpenAI
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY") client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
response = client.chat.completions.create( response = client.chat.completions.create(
model="meta-llama/Meta-Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
messages=[ messages=[
{"role": "system", "content": "You are a helpful AI assistant"}, {"role": "system", "content": "You are a helpful AI assistant"},
{ {
......
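The same chat completion as a raw HTTP call, handy for checking the renamed model id without the Python client; a sketch against the server from the usage string above:

```bash
curl -s http://127.0.0.1:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer EMPTY" \
  -d '{
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "messages": [
          {"role": "system", "content": "You are a helpful AI assistant"},
          {"role": "user", "content": "Hello!"}
        ]
      }'
```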
@@ -23,13 +23,13 @@ from sglang.srt.utils import kill_child_process
from sglang.utils import get_exception_traceback

DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
-DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
...
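Note that the `neuralmagic/` and `hugging-quants/` ids keep their `Meta-Llama-3.1` prefix: those are third-party quantized repos that were not renamed on the Hub, only the first-party `meta-llama/` repos were. A quick check that no stale first-party paths remain; a sketch to run from the repo root:

```bash
# Should print nothing after this PR; third-party quantized repo ids are unaffected
grep -rn "meta-llama/Meta-Llama-3.1" --include="*.py" --include="*.md" .
```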
@@ -44,7 +44,7 @@ class ModelCase:
# Popular models that run on CI
CI_MODELS = [
-    ModelCase("meta-llama/Meta-Llama-3.1-8B-Instruct"),
+    ModelCase("meta-llama/Llama-3.1-8B-Instruct"),
    ModelCase("google/gemma-2-2b"),
]
...
@@ -499,7 +499,7 @@ class TestOpenAIServer(unittest.TestCase):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
response = client.chat.completions.create(
-    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant"},
        {
...
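To exercise the updated ids end to end, the touched test module can be run directly; a sketch, with the module location assumed from the upstream test layout:

```bash
cd test/srt
python3 -m unittest test_openai_server -v
```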