Unverified commit c49484a6, authored by Lianmin Zheng, committed by GitHub

[Auto Sync] Update scheduler_profiler_mixin.py, rpd_utils.p... (20250916) (#10494)


Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: cctry <shiyang@x.ai>
parent a2f7218a
@@ -204,7 +204,7 @@ class SchedulerProfilerMixin:
             torch.distributed.barrier(self.tp_cpu_group)
             if self.tp_rank == 0:
-                from sglang.srt.utils import rpd_to_chrome_trace
+                from sglang.srt.rpd_utils import rpd_to_chrome_trace
 
                 rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
             self.rpd_profiler = None
@@ -752,6 +752,25 @@ def load_image(
     return image, image_size
 
 
+def get_image_bytes(image_file: Union[str, bytes]):
+    if isinstance(image_file, bytes):
+        return image_file
+    elif image_file.startswith("http://") or image_file.startswith("https://"):
+        timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
+        response = requests.get(image_file, timeout=timeout)
+        return response.content
+    elif image_file.lower().endswith(("png", "jpg", "jpeg", "webp", "gif")):
+        with open(image_file, "rb") as f:
+            return f.read()
+    elif image_file.startswith("data:"):
+        image_file = image_file.split(",")[1]
+        return pybase64.b64decode(image_file)
+    elif isinstance(image_file, str):
+        return pybase64.b64decode(image_file)
+    else:
+        raise NotImplementedError(f"Invalid image: {image_file}")
+
+
 def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
     # We import decord here to avoid a strange Segmentation fault (core dumped) issue.
     from decord import VideoReader, cpu, gpu
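For context, the new get_image_bytes helper accepts raw bytes, http(s) URLs, local image paths, data URLs, and bare base64 strings. A minimal usage sketch (the inputs are placeholders, and the import path assumes the helper lands in sglang.srt.utils):

    # Placeholders throughout; each branch of get_image_bytes is exercised once.
    from sglang.srt.utils import get_image_bytes

    raw = get_image_bytes(b"\x89PNG...")                             # bytes pass through
    fetched = get_image_bytes("https://example.com/cat.png")         # downloaded via requests
    local = get_image_bytes("/tmp/cat.jpg")                          # read from disk
    decoded = get_image_bytes("data:image/png;base64,iVBORw0KGgo=")  # base64 payload decoded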
@@ -807,6 +826,33 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
             os.unlink(tmp_file.name)
 
 
+def encode_video(video_path, frame_count_limit=None):
+    # Lazy import because decord is not available on some arm platforms.
+    from decord import VideoReader, cpu
+
+    if not os.path.exists(video_path):
+        logger.error(f"Video {video_path} does not exist")
+        return []
+
+    if frame_count_limit == 0:
+        return []
+
+    def uniform_sample(l, n):
+        gap = len(l) / n
+        idxs = [int(i * gap + gap / 2) for i in range(n)]
+        return [l[i] for i in idxs]
+
+    vr = VideoReader(video_path, ctx=cpu(0))
+    sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+    frame_indices = [i for i in range(0, len(vr), sample_fps)]
+    if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
+        frame_indices = uniform_sample(frame_indices, frame_count_limit)
+
+    frames = vr.get_batch(frame_indices).asnumpy()
+    frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+    return frames
+
+
 def suppress_other_loggers():
     warnings.filterwarnings(
         "ignore", category=UserWarning, message="The given NumPy array is not writable"
@@ -949,6 +995,13 @@ def set_ulimit(target_soft_limit=65535):
             logger.warning(f"Fail to set RLIMIT_STACK: {e}")
 
 
+def rank0_log(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
+
+
 def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
     async def authentication(request, call_next):
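rank0_log can be called from every tensor-parallel rank while only rank 0 emits the line. A one-line sketch (the message is illustrative):

    # Illustrative call site: logs once per TP group rather than once per rank.
    rank0_log("model weights loaded")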
@@ -3045,6 +3098,44 @@ def check_cuda_result(raw_output):
     return results
 
 
+def get_physical_device_id(pytorch_device_id: int) -> int:
+    """
+    Convert PyTorch logical device ID to physical device ID.
+    """
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    assert (
+        cuda_visible_devices is not None
+    ), "CUDA_VISIBLE_DEVICES should be set in a scheduler"
+    device_list = cuda_visible_devices.split(",")
+    assert (
+        len(device_list) == 1
+    ), "CUDA_VISIBLE_DEVICES should be set to a single device in a scheduler"
+    return int(device_list[0])
+
+
+def get_device_sm_nvidia_smi():
+    try:
+        # Run nvidia-smi command and capture output
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        # Get the first line of output (assuming at least one GPU exists)
+        compute_cap_str = result.stdout.strip().split("\n")[0]
+        # Convert string (e.g., "9.0") to tuple of integers (9, 0)
+        major, minor = map(int, compute_cap_str.split("."))
+        return (major, minor)
+    except (subprocess.CalledProcessError, FileNotFoundError, ValueError) as e:
+        # Handle cases where nvidia-smi isn't available or output is unexpected
+        print(f"Error getting compute capability: {e}")
+        return (0, 0)  # Default/fallback value
+
+
 def numa_bind_to_node(node: int):
     libnuma = ctypes.CDLL("libnuma.so")
     if libnuma.numa_available() < 0:
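A hedged sketch of the two new device helpers together, assuming a scheduler-style process that has already pinned a single GPU (the environment value and returned capability below are illustrative):

    # Sketch only: values depend on the machine this runs on.
    import os

    os.environ["CUDA_VISIBLE_DEVICES"] = "3"    # the single device a scheduler sees
    physical_id = get_physical_device_id(0)     # -> 3, the lone entry in the list
    major, minor = get_device_sm_nvidia_smi()   # e.g. (9, 0) on Hopper; (0, 0) on failure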
@@ -3061,3 +3152,33 @@ def json_list_type(value):
         raise argparse.ArgumentTypeError(
             f"Invalid JSON list: {value}. Please provide a valid JSON list."
         )
+
+
+@contextmanager
+def temp_set_cuda_visible_devices(gpu_id: int):
+    original_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+    if original_cuda_visible_devices:
+        cuda_visible_devices = original_cuda_visible_devices.split(",")
+    else:
+        cuda_visible_devices = []
+    str_gpu_id = cuda_visible_devices[gpu_id] if cuda_visible_devices else str(gpu_id)
+    os.environ["CUDA_VISIBLE_DEVICES"] = str_gpu_id
+    yield
+    if original_cuda_visible_devices:
+        os.environ["CUDA_VISIBLE_DEVICES"] = original_cuda_visible_devices
+    else:
+        del os.environ["CUDA_VISIBLE_DEVICES"]
+
+
+def get_extend_input_len_swa_limit(
+    sliding_window_size: int, chunked_prefill_size: int, page_size: int
+) -> int:
+    # 1. The factor of 2x is because each prefill contains chunked_prefill_size tokens,
+    #    and between prefills we run swa_radix_cache.cache_unfinished_req(),
+    #    which unlocks the previously locked nodes.
+    # 2. max() handles the case where chunked_prefill_size is larger than
+    #    sliding_window_size; in that case each prefill contains chunked_prefill_size
+    #    tokens, and we can only free out-of-sliding-window kv indices after each prefill.
+    # 3. page_size is added because we want one token of extra room for generated tokens.
+    return page_size + 2 * max(sliding_window_size, chunked_prefill_size)
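A usage sketch for the new context manager (the device list is made up); note that gpu_id indexes into the original CUDA_VISIBLE_DEVICES list when one is set:

    # Hypothetical example: narrow visibility to one GPU, then restore on exit.
    os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
    with temp_set_cuda_visible_devices(1):
        assert os.environ["CUDA_VISIBLE_DEVICES"] == "5"     # entry 1 of the original list
    assert os.environ["CUDA_VISIBLE_DEVICES"] == "4,5,6,7"   # original value restored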
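The SWA limit formula is easiest to see with concrete numbers (the sizes below are illustrative only):

    # Illustrative arithmetic for get_extend_input_len_swa_limit.
    page_size = 1
    sliding_window_size = 4096
    chunked_prefill_size = 8192
    # max(...) = 8192 since the chunk exceeds the window; doubled to account for
    # the still-locked previous chunk, plus one page of headroom.
    limit = page_size + 2 * max(sliding_window_size, chunked_prefill_size)  # 16385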
+from __future__ import annotations
 
 import logging
-from typing import List
+from typing import TYPE_CHECKING, List
 
 import numpy as np
 import tqdm
 
 from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
 logger = logging.getLogger(__file__)
 
 _warmup_registry = {}
 
 
-def warmup(name: str) -> callable:
-    def decorator(fn: callable):
+def warmup(name: str):
+    def decorator(fn):
         _warmup_registry[name] = fn
         return fn
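A sketch of how the registry decorator is consumed (the warmup name and body here are hypothetical):

    # Hypothetical registration: stores the function in _warmup_registry["demo"].
    @warmup("demo")
    async def demo_warmup(tokenizer_manager: TokenizerManager):
        ...  # e.g. send a small GenerateReqInput through the tokenizer manager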