Unverified commit dccf52f9 authored by ryang, committed by GitHub

[UT for RL] Add UT to cover release/resume memory case for moe model (#8803)

parent 676a7b51
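
As background for the diff below, a minimal, hedged sketch of the release/resume flow the new unit test exercises. It only uses APIs and parameters that appear in this commit (sgl.Engine with enable_memory_saver, release_memory_occupation, resume_memory_occupation); the model name and parallelism sizes are simply the test's values, not a recommended configuration:

import sglang as sgl

# Sketch only (not part of the commit): release and resume GPU memory for an
# EP-sharded MoE model, mirroring the new test_moe_model_release_and_resume.
engine = sgl.Engine(
    model_path="Qwen/Qwen1.5-MoE-A2.7B-Chat",  # small MoE chat model used by the test
    enable_memory_saver=True,                  # needed for release/resume support
    mem_fraction_static=0.5,
    tp_size=2,
    ep_size=2,                                 # expert parallelism across 2 GPUs
)

# Free weight/KV-cache memory (e.g. while an RL trainer needs the GPUs) ...
engine.release_memory_occupation()
# ... then re-occupy it before generating again (the test also reloads weights
# from a HF checkpoint via engine.update_weights_from_tensor at this point).
engine.resume_memory_occupation()
engine.shutdown()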
@@ -42,7 +42,8 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
+DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
+DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
 # MLA test models
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
...
@@ -8,7 +8,7 @@ import torch
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
-    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
...
@@ -38,6 +38,8 @@ from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGH
 from sglang.test.test_utils import (
     DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
     DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
     CustomTestCase,
 )
@@ -50,7 +52,7 @@ def get_gpu_memory_gb():
 class TestReleaseMemoryOccupation(CustomTestCase):
-    def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1):
+    def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1, ep_size=1):
         """Common setup for engine and HF model."""
         engine = sgl.Engine(
             model_path=model_name,
@@ -58,6 +60,7 @@ class TestReleaseMemoryOccupation(CustomTestCase):
             enable_memory_saver=True,
             mem_fraction_static=mem_fraction_static,
             tp_size=tp_size,
+            ep_size=ep_size,
             # disable_cuda_graph=True, # for debugging only
         )
@@ -70,6 +73,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
             "sampling_params": {"temperature": 0, "max_new_tokens": 8},
             "expect_output_before_update_weights": " to spend it outdoors. I decided to",
             "expect_output_after_update_weights": " to go for a walk. I like",
+            "prompt_moe": "The weather is nice today, and I want to",
+            "sampling_params_moe": {"temperature": 0, "max_new_tokens": 16},
+            "expect_output_before_update_weights_moe": " go to the park. I have a picnic basket, a book, and a",
+            "expect_output_after_update_weights_moe": " go to the park. I have a lot of things to do, but I",
         }
 
     def _test_initial_generation(
@@ -250,6 +257,72 @@ class TestReleaseMemoryOccupation(CustomTestCase):
         self.assertEqual(outputs, params["expect_output_after_update_weights"])
 
         engine.shutdown()
 
+    def test_moe_model_release_and_resume(self):
+        # Test with MoE model
+        model_name = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT
+        tp_size = ep_size = 2
+        print(
+            f"Testing tp_size={tp_size} and ep_size={ep_size} for test_moe_model_release_and_resume"
+        )
+
+        engine = sgl.Engine(
+            model_path=model_name,
+            random_seed=42,
+            enable_memory_saver=True,
+            mem_fraction_static=0.5,
+            tp_size=tp_size,
+            ep_size=ep_size,
+        )
+        params = self._common_test_params()
+
+        self._test_initial_generation(
+            engine,
+            params["prompt_moe"],
+            params["sampling_params_moe"],
+            params["expect_output_before_update_weights_moe"],
+        )
+
+        t = time.perf_counter()
+        gpu_memory_usage_before_release = get_gpu_memory_gb()
+        engine.release_memory_occupation()
+        gpu_memory_usage_after_release = get_gpu_memory_gb()
+        self.assertLess(
+            gpu_memory_usage_after_release,
+            gpu_memory_usage_before_release,
+        )
+        print(
+            f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
+        )
+
+        if _DEBUG_EXTRA:
+            time.sleep(3)
+
+        t = time.perf_counter()
+        engine.resume_memory_occupation()
+        print(
+            f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
+        )
+
+        hf_model_new = AutoModelForCausalLM.from_pretrained(
+            DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
+            torch_dtype="bfloat16",
+            device_map="cuda",
+        )
+        engine.update_weights_from_tensor(list(hf_model_new.named_parameters()))
+
+        # destroy the hf model
+        del hf_model_new
+        torch.cuda.empty_cache()
+
+        print("generate (#2)")
+        outputs = engine.generate(params["prompt_moe"], params["sampling_params_moe"])[
+            "text"
+        ]
+        self.assertEqual(outputs, params["expect_output_after_update_weights_moe"])
+
+        engine.shutdown()
+
 
 if __name__ == "__main__":
     unittest.main()
@@ -7,7 +7,7 @@ import requests
 from sglang.srt.utils import is_cuda, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
-    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
 class TestTorchCompileMoe(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model,
...