[ROCm] Reduce compile time when using torch.compile (#10559)

c0dbbdd1 · sogalin · GitHub · 25e7dbe8 · c0dbbdd1
Unverified Commit c0dbbdd1 authored Oct 02, 2025 by sogalin Committed by GitHub Oct 01, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 1 deletion

python/sglang/srt/model_executor/cuda_graph_runner.py python/sglang/srt/model_executor/cuda_graph_runner.py +5 -1

No files found.
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -53,7 +53,9 @@ from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin
 from sglang.srt.utils import (
    empty_context,
    get_available_gpu_memory,
+    get_bool_env_var,
    get_device_memory_capacity,
+    is_hip,
    log_info_on_rank0,
    require_attn_tp_gather,
    require_gathered_buffer,
@@ -61,6 +63,8 @@ from sglang.srt.utils import (
    require_mlp_tp_gather,
 )

+_is_hip = is_hip()
+
 logger = logging.getLogger(__name__)

 if TYPE_CHECKING:
@@ -137,7 +141,7 @@ def patch_model(
                mode=os.environ.get(
                    "SGLANG_TORCH_COMPILE_MODE", "max-autotune-no-cudagraphs"
                ),
-                dynamic=False,
+                dynamic=_is_hip and get_bool_env_var("SGLANG_TORCH_DYNAMIC_SHAPE"),
            )
        else:
            yield model.forward