Unverified Commit 35ca04d2 authored by Lianmin Zheng, committed by GitHub

[CI] fix port conflicts (#5789)

parent 3c4e0ee6
@@ -54,7 +54,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        part: [0, 1, 2, 3, 4, 5, 6]
+        part: [0, 1, 2, 3, 4, 5, 6, 7]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -64,10 +64,10 @@ jobs:
           bash scripts/ci_install_dependency.sh
       - name: Run test
-        timeout-minutes: 40
+        timeout-minutes: 30
        run: |
           cd test/srt
-          python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
+          python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 8
  unit-test-backend-2-gpu:
    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
......
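The matrix above grows from 7 to 8 partitions, and the runner passes --auto-partition-id / --auto-partition-size so each job only executes its shard of the per-commit suite; shrinking timeout-minutes from 40 to 30 relies on that extra shard. As a rough illustration of how such sharding can stay balanced, here is a minimal sketch (not the actual run_suite.py implementation) that assigns test files to partitions greedily by their estimated runtime in seconds, the same per-file estimates that this commit updates below:

    # Hypothetical greedy partitioner: place the longest-running files first,
    # always into the currently lightest bucket, then return the bucket that
    # corresponds to this job's --auto-partition-id.
    from typing import List, Tuple

    def auto_partition(
        files: List[Tuple[str, int]], partition_id: int, partition_size: int
    ) -> List[str]:
        buckets: List[List[str]] = [[] for _ in range(partition_size)]
        loads = [0] * partition_size
        for path, seconds in sorted(files, key=lambda x: -x[1]):
            idx = loads.index(min(loads))  # lightest bucket so far
            buckets[idx].append(path)
            loads[idx] += seconds
        return buckets[partition_id]

    # Example: split three files across two CI jobs.
    print(auto_partition([("test_a.py", 500), ("test_b.py", 300), ("test_c.py", 250)], 0, 2))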
@@ -977,6 +977,7 @@ async def benchmark(
     profile: bool,
     pd_seperated: bool = False,
     flush_cache: bool = False,
+    warmup_requests: int = 1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -993,10 +994,8 @@ async def benchmark(
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
-    if not hasattr(args, "warmup_requests"):
-        args.warmup_requests = 1
     # Warmup
-    print(f"Starting warmup with {args.warmup_requests} sequences...")
+    print(f"Starting warmup with {warmup_requests} sequences...")
     # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
@@ -1018,7 +1017,7 @@ async def benchmark(
     # Run warmup requests
     warmup_tasks = []
-    for _ in range(args.warmup_requests):
+    for _ in range(warmup_requests):
         warmup_tasks.append(
             asyncio.create_task(request_func(request_func_input=test_input))
         )
@@ -1026,9 +1025,7 @@ async def benchmark(
     warmup_outputs = await asyncio.gather(*warmup_tasks)
     # Check if at least one warmup request succeeded
-    if args.warmup_requests > 0 and not any(
-        output.success for output in warmup_outputs
-    ):
+    if warmup_requests > 0 and not any(output.success for output in warmup_outputs):
         raise ValueError(
             "Warmup failed - Please make sure benchmark arguments "
             f"are correctly specified. Error: {warmup_outputs[0].error}"
......
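The warmup count is now an explicit warmup_requests parameter of benchmark() instead of being read off args with a hasattr fallback, so callers that build the coroutine directly no longer need a full argparse namespace. A self-contained sketch of the same warmup pattern, with a toy fake_request and Output standing in for the real request_func and its result type:

    import asyncio
    from dataclasses import dataclass

    @dataclass
    class Output:
        success: bool
        error: str = ""

    async def fake_request(i: int) -> Output:
        # Placeholder for the real async request function.
        await asyncio.sleep(0.01)
        return Output(success=True)

    async def warmup(warmup_requests: int = 1) -> None:
        # Fire all warmup requests concurrently, then require at least one success.
        tasks = [asyncio.create_task(fake_request(i)) for i in range(warmup_requests)]
        outputs = await asyncio.gather(*tasks)
        if warmup_requests > 0 and not any(o.success for o in outputs):
            raise ValueError(f"Warmup failed. Error: {outputs[0].error}")

    asyncio.run(warmup(warmup_requests=2))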
@@ -281,7 +281,9 @@ async def generate_from_file_request(file: UploadFile, request: Request):
     )
     try:
-        ret = await _global_state.generate_request(obj, request).__anext__()
+        ret = await _global_state.tokenizer_manager.generate_request(
+            obj, request
+        ).__anext__()
         return ret
     except ValueError as e:
         logger.error(f"Error: {e}")
......
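The handler now routes through _global_state.tokenizer_manager.generate_request(...), which, as the .__anext__() call implies, returns an async generator; awaiting __anext__() pulls only its first yielded item. A toy example of that pattern:

    import asyncio

    async def numbers():
        # Stand-in for an async generator such as generate_request().
        for i in range(3):
            yield i

    async def main():
        first = await numbers().__anext__()  # take only the first yielded value
        print(first)  # prints 0

    asyncio.run(main())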
@@ -8,7 +8,6 @@ import random
 import subprocess
 import threading
 import time
-import traceback
 import unittest
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
......
@@ -14,7 +14,7 @@ class TestFile:
 suites = {
     "per-commit": [
         TestFile("models/lora/test_lora.py", 76),
-        TestFile("models/lora/test_lora_backend.py", 420),
+        TestFile("models/lora/test_lora_backend.py", 99),
         TestFile("models/lora/test_multi_lora_backend.py", 60),
         TestFile("models/test_embedding_models.py", 35),
         TestFile("models/test_generation_models.py", 103),
@@ -23,30 +23,30 @@ suites = {
         TestFile("models/test_compressed_tensors_models.py", 100),
         TestFile("models/test_reward_models.py", 83),
         TestFile("models/test_gme_qwen_models.py", 45),
-        TestFile("models/test_clip_models.py", 100),
-        TestFile("models/test_vlm_models.py", 100),
+        TestFile("models/test_clip_models.py", 52),
+        TestFile("models/test_vlm_models.py", 581),
         TestFile("test_abort.py", 51),
         TestFile("test_block_int8.py", 22),
-        TestFile("test_chunked_prefill.py", 336),
-        TestFile("test_eagle_infer.py", 500),
+        TestFile("test_chunked_prefill.py", 285),
+        TestFile("test_eagle_infer.py", 584),
         TestFile("test_ebnf_constrained.py"),
-        TestFile("test_fa3.py", 400),
+        TestFile("test_fa3.py", 376),
         TestFile("test_fp8_kernel.py", 8),
-        TestFile("test_embedding_openai_server.py", 36),
+        TestFile("test_embedding_openai_server.py", 141),
         TestFile("test_hidden_states.py", 55),
         TestFile("test_int8_kernel.py", 8),
         TestFile("test_input_embeddings.py", 38),
         TestFile("test_json_constrained.py", 98),
         TestFile("test_large_max_new_tokens.py", 41),
         TestFile("test_metrics.py", 32),
-        TestFile("test_mla.py", 162),
+        TestFile("test_mla.py", 242),
         TestFile("test_mla_deepseek_v3.py", 221),
-        TestFile("test_mla_int8_deepseek_v3.py", 522),
+        TestFile("test_mla_int8_deepseek_v3.py", 674),
         TestFile("test_mla_flashinfer.py", 395),
-        TestFile("test_mla_fp8.py", 93),
+        TestFile("test_mla_fp8.py", 153),
         TestFile("test_no_chunked_prefill.py", 126),
         TestFile("test_no_overlap_scheduler.py", 262),
-        TestFile("test_openai_server.py", 186),
+        TestFile("test_openai_server.py", 149),
         TestFile("test_penalty.py", 41),
         TestFile("test_page_size.py", 60),
         TestFile("test_pytorch_sampling_backend.py", 66),
@@ -57,11 +57,11 @@ suites = {
         TestFile("test_request_length_validation.py", 31),
         TestFile("test_retract_decode.py", 54),
         TestFile("test_server_args.py", 1),
-        TestFile("test_skip_tokenizer_init.py", 72),
+        TestFile("test_skip_tokenizer_init.py", 117),
         TestFile("test_srt_engine.py", 237),
         TestFile("test_srt_endpoint.py", 94),
         TestFile("test_torch_compile.py", 76),
-        TestFile("test_torch_compile_moe.py", 85),
+        TestFile("test_torch_compile_moe.py", 235),
         TestFile("test_torch_native_attention_backend.py", 123),
         TestFile("test_torchao.py", 70),
         TestFile("test_triton_attention_kernels.py", 4),
@@ -69,27 +69,27 @@ suites = {
         TestFile("test_update_weights_from_disk.py", 114),
         TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_vertex_endpoint.py", 31),
-        TestFile("test_vision_chunked_prefill.py", 99),
+        TestFile("test_vision_chunked_prefill.py", 119),
         TestFile("test_vlm_accuracy.py", 60),
-        TestFile("test_vision_openai_server.py", 537),
+        TestFile("test_vision_openai_server.py", 637),
         TestFile("test_fim_completion.py", 40),
         TestFile("test_w8a8_quantization.py", 46),
         TestFile("test_eval_fp8_accuracy.py", 303),
         TestFile("test_create_kvindices.py", 2),
-        TestFile("test_hicache.py", 60),
-        TestFile("test_hicache_mla.py", 90),
+        TestFile("test_hicache.py", 116),
+        TestFile("test_hicache_mla.py", 254),
         TestFile("test_fused_moe.py", 30),
         TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
     ],
     "per-commit-2-gpu": [
-        TestFile("models/lora/test_lora_tp.py", 150),
-        TestFile("test_data_parallelism.py", 90),
-        TestFile("test_dp_attention.py", 150),
-        TestFile("test_mla_tp.py", 174),
-        TestFile("test_moe_ep.py", 220),
-        TestFile("test_patch_torch.py", 30),
-        TestFile("test_update_weights_from_distributed.py", 100),
-        TestFile("test_verl_engine.py", 100),
+        TestFile("models/lora/test_lora_tp.py", 116),
+        TestFile("test_data_parallelism.py", 73),
+        TestFile("test_dp_attention.py", 137),
+        TestFile("test_mla_tp.py", 170),
+        TestFile("test_moe_ep.py", 181),
+        TestFile("test_patch_torch.py", 19),
+        TestFile("test_update_weights_from_distributed.py", 103),
+        TestFile("test_verl_engine.py", 64),
     ],
     "per-commit-8-gpu": [
         TestFile("test_local_attn.py", 250),
......
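The entries above are TestFile(path, estimated_seconds) records; the second argument is the runtime estimate that the auto-partitioner can balance on (see the sketch after the workflow diff), and it is optional. A hypothetical shape for those structures, with the field names and the default estimate assumed rather than copied from run_suite.py:

    from dataclasses import dataclass

    @dataclass
    class TestFile:
        name: str
        estimated_time: int = 60  # seconds; assumed default when no estimate is given

    suites = {
        "per-commit": [
            TestFile("test_abort.py", 51),
            TestFile("test_ebnf_constrained.py"),  # falls back to the default estimate
        ],
    }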
@@ -24,7 +24,7 @@ class TestTorchCompileMoe(CustomTestCase):
             cls.model,
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--enable-torch-compile", "--torch-compile-max-bs", "8"],
+            other_args=["--enable-torch-compile", "--torch-compile-max-bs", "4"],
         )
     @classmethod
......
@@ -129,7 +129,7 @@ def init_process_hf(
     hf_instruct_params = []
     hf_base_params = []
-    print("get parameter in hf instruct model and base model")
+    print("[hf] get parameter in hf instruct model and base model")
     for parameter_name in checking_parameters:
         hf_instruct_params.append(
             hf_instruct_model.get_parameter(parameter_name)[:truncate_size]
@@ -152,10 +152,12 @@ def init_process_hf(
     param_queue.put(("hf_base_params", hf_base_params))
     # Init weight update group for rank 0 (the training engine in RLHF).
-    print(f"rank {rank} world_size: {world_size} init custom process group")
+    port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+    init_method = f"tcp://localhost:{port}"
+    print(f"[hf] {rank=} {world_size=} init custom process group. {init_method=}")
     group = init_custom_process_group(
         backend="nccl",
-        init_method="tcp://localhost:65500",
+        init_method=init_method,
         world_size=world_size,
         rank=rank,
         group_name="test_parameter_update_group",
@@ -184,7 +186,7 @@ def init_process_hf(
     # Measure the latency of broadcasting/weights update.
     broadcast_time = time_end_broadcast - time_begin_broadcast
-    print(f"rank {rank} broadcast parameter time: {broadcast_time:.3f}s")
+    print(f"[hf] {rank=} {broadcast_time=:.3f}s")
     param_queue.put(("broadcast_time", broadcast_time))
     # Delete the huggingface models to free up memory.
@@ -210,17 +212,21 @@ def init_process_sgl(
     torch.cuda.synchronize()
     base_gpu_id = 1 if rank == 1 else 1 + tp_size
     if backend == "Engine":
+        print(f"[sgl] rank {rank} init engine")
         engine = sgl.Engine(
             model_path=model_name,
+            random_seed=42,
             base_gpu_id=base_gpu_id,
             tp_size=tp_size,
+            cuda_graph_max_bs=2,
         )
     else:
         if rank == 1:
             url = DEFAULT_URL_FOR_TEST
         else:
-            url = DEFAULT_URL_FOR_TEST.replace("2157", "2159")
+            host, port = DEFAULT_URL_FOR_TEST.split(":")
+            url = ":".join(host, str(int(port) + 10000))
+        print(f"[sgl] rank {rank} init server on url: {url}")
         process = popen_launch_server(
             model_name,
             url,
@@ -230,13 +236,11 @@ def init_process_sgl(
                 str(base_gpu_id),
                 "--tp-size",
                 str(tp_size),
-                "--cuda-graph-max-bs",
-                2,
             ),
         )
     torch.cuda.synchronize()
-    if backend == "Engine":
-        print(f"rank {rank} init engine")
-    else:
-        print(f"rank {rank} init server on url: {url}")
     # Get weights of instruct model, i.e. pre-training weights.
     instruct_params = []
@@ -252,11 +256,13 @@ def init_process_sgl(
     param_queue.put((f"sgl_dp_{rank}_instruct_params", instruct_params))
+    port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
     # Init weight update group with the training engine.
     if backend == "Engine":
         engine.init_weights_update_group(
             master_address="localhost",
-            master_port="65500",
+            master_port=str(port),
             rank_offset=base_gpu_id,
             world_size=world_size,
             group_name="test_parameter_update_group",
@@ -267,7 +273,7 @@ def init_process_sgl(
             f"{url}/init_weights_update_group",
             json={
                 "master_address": "localhost",
-                "master_port": "65500",
+                "master_port": str(port),
                 "rank_offset": base_gpu_id,
                 "world_size": world_size,
                 "group_name": "test_parameter_update_group",
@@ -311,7 +317,7 @@ def init_process_sgl(
     # Measure the latency of broadcast/weights update.
     update_time = time_end_update - time_begin_update
     print(
-        f"fully update model_name {model_name} rank {rank} parameter from distributed time: {update_time:.3f}s"
+        f"[sgl] fully update model_name {model_name} rank {rank} parameter from distributed time: {update_time:.3f}s"
     )
     param_queue.put((f"update_sgl_dp_{rank}_time", update_time))
......
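The port-conflict fix itself is the shared port derivation used in both the HF and SGLang processes above: instead of every CI job racing for the hard-coded rendezvous port 65500 (and a fixed second server port), each job derives a port from the first device ID in its CUDA_VISIBLE_DEVICES, which is disjoint across concurrent jobs on the same runner. A standalone sketch of that derivation:

    import os

    # Key the NCCL rendezvous port on the first visible GPU, so jobs pinned to
    # different GPU sets on one machine never pick the same port. Note that
    # taking [0] reads only the first character of the variable, so this
    # assumes device IDs below 10.
    first_gpu = int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0])
    port = 60000 + first_gpu * 100  # e.g. GPU 0 -> 60000, GPU 4 -> 60400
    init_method = f"tcp://localhost:{port}"
    print(init_method)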