refactor: simplify GPU Memory Service integrations and module boundaries (#7875)

39a6a240 · Schwinn Saereesitthipitak · GitHub · 02666f04 · 39a6a240 · 39a6a240
Unverified commit 39a6a240, authored Apr 09, 2026 by Schwinn Saereesitthipitak; committed by GitHub on Apr 09, 2026.
11 changed files
--- a/tests/gms/common/test_gms_sglang_patches.py
+++ b/tests/gms/common/test_gms_sglang_patches.py
@@ -6,13 +6,16 @@ from __future__ import annotations
 import sys
 import types

+import gpu_memory_service.integrations.sglang.patches as sglang_patches
 import pytest
-from gpu_memory_service.integrations.sglang import patches as sglang_patches
+
+torch = pytest.importorskip("torch", reason="torch is required")

 pytestmark = [
    pytest.mark.pre_merge,
    pytest.mark.unit,
    pytest.mark.gpu_0,
+    pytest.mark.sglang,
 ]


@@ -37,8 +40,7 @@ def test_patch_model_runner_rewrites_total_gpu_memory(monkeypatch):
    )

    class FakeImpl:
-        def get_imported_weights_bytes(self):
-            return 8 << 30
+        imported_weights_bytes = 8 << 30

    fake_memory_saver.get_gms_memory_saver_impl = lambda: FakeImpl()

@@ -55,6 +57,11 @@ def test_patch_model_runner_rewrites_total_gpu_memory(monkeypatch):
        "gpu_memory_service.integrations.sglang.memory_saver",
        fake_memory_saver,
    )
+    monkeypatch.setattr(
+        sglang_patches,
+        "get_gms_memory_saver_impl",
+        lambda: FakeImpl(),
+    )
    monkeypatch.setattr(sglang_patches, "_model_runner_patched", False)
    monkeypatch.delattr(FakeModelRunner, "_gms_patched", raising=False)
    monkeypatch.setattr(

--- a/tests/gms/common/test_gms_torch_allocator.py
+++ b/tests/gms/common/test_gms_torch_allocator.py
@@ -5,11 +5,12 @@ from __future__ import annotations

 import pytest
 from gpu_memory_service.client.torch import allocator as allocator_module
-from gpu_memory_service.common.types import GrantedLockType, RequestedLockType
+from gpu_memory_service.common.locks import GrantedLockType, RequestedLockType

 pytestmark = [
    pytest.mark.pre_merge,
    pytest.mark.unit,
+    pytest.mark.none,
    pytest.mark.gpu_0,
 ]


--- a/tests/gms/conftest.py
+++ b/tests/gms/conftest.py
@@ -5,16 +5,7 @@

 import pytest

-# Skip collection entirely if gpu_memory_service is not installed.
-# This package lives under nested common/ and integration/ subdirectories, so
-# we ignore those directories directly instead of only matching test files next
-# to this conftest.
-try:
-    import gpu_memory_service  # noqa: F401
-except ImportError:
-    collect_ignore = ["common", "integration"]
-
-from tests.utils.port_utils import allocate_port, deallocate_ports
+from tests.utils.port_utils import allocate_port, deallocate_ports  # noqa: E402


 @pytest.fixture

--- a/tests/gms/harness/external_weight_writer.py
+++ b/tests/gms/harness/external_weight_writer.py
@@ -10,11 +10,13 @@ import subprocess
 from contextlib import contextmanager

 import torch
-from gpu_memory_service import get_or_create_gms_client_memory_manager
 from gpu_memory_service.client.memory_manager import GMSClientMemoryManager
-from gpu_memory_service.client.torch.allocator import gms_use_mem_pool
+from gpu_memory_service.client.torch.allocator import (
+    get_or_create_gms_client_memory_manager,
+    gms_use_mem_pool,
+)
 from gpu_memory_service.client.torch.module import register_module_tensors
-from gpu_memory_service.common.types import RequestedLockType
+from gpu_memory_service.common.locks import RequestedLockType
 from gpu_memory_service.common.utils import get_socket_path

 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME

--- a/tests/gms/harness/runtime.py
+++ b/tests/gms/harness/runtime.py
@@ -7,7 +7,6 @@ import logging
 import time
 from pathlib import Path

-import pynvml
 import requests

 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
@@ -19,6 +18,8 @@ MIN_EXPECTED_MEMORY_RETURN_FRACTION = 0.6


 def get_gpu_memory_used(device: int = 0) -> int:
+    import pynvml
+
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device)

--- a/tests/gms/integration/__init__.py
+++ b/tests/gms/integration/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+pytest.importorskip("gpu_memory_service", reason="gpu_memory_service is required")
--- a/tests/gms/integration/test_external_weight_mgr.py
+++ b/tests/gms/integration/test_external_weight_mgr.py
@@ -10,7 +10,8 @@ from typing import Callable, Protocol

 import pytest
 from gpu_memory_service.client.session import _GMSClientSession
-from gpu_memory_service.common.types import RequestedLockType, ServerState
+from gpu_memory_service.common.locks import RequestedLockType
+from gpu_memory_service.server.fsm import ServerState

 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.managed_process import DynamoFrontendProcess

--- a/tests/gms/integration/test_gms_shadow_failover.py
+++ b/tests/gms/integration/test_gms_shadow_failover.py
@@ -12,7 +12,7 @@ from contextlib import ExitStack
 from typing import Callable

 import pytest
-from gpu_memory_service.common.types import ServerState
+from gpu_memory_service.server.fsm import ServerState

 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess

--- a/tests/gms/integration/test_gms_sleep_wake.py
+++ b/tests/gms/integration/test_gms_sleep_wake.py
@@ -9,7 +9,7 @@ from contextlib import ExitStack
 from typing import Callable

 import pytest
-from gpu_memory_service.common.types import ServerState
+from gpu_memory_service.server.fsm import ServerState

 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess

--- a/tests/gms/integration/test_gms_torch_integration.py
+++ b/tests/gms/integration/test_gms_torch_integration.py
@@ -12,7 +12,7 @@ from gpu_memory_service.client.torch.module import (
    register_module_tensors,
 )
 from gpu_memory_service.client.torch.tensor import _tensor_from_pointer
-from gpu_memory_service.common.types import RequestedLockType
+from gpu_memory_service.common.locks import RequestedLockType

 from tests.gms.harness.gms import GMSServerProcess

@@ -21,6 +21,7 @@ torch = pytest.importorskip("torch", reason="torch is required")
 pytestmark = [
    pytest.mark.pre_merge,
    pytest.mark.unit,
+    pytest.mark.none,
    pytest.mark.gpu_1,
 ]


--- a/tests/report_pytest_markers.py
+++ b/tests/report_pytest_markers.py
@@ -129,6 +129,7 @@ STUB_MODULES = [
    "gpu_memory_service.client.torch.module",
    "gpu_memory_service.client.torch.tensor",
    "gpu_memory_service.common",
+    "gpu_memory_service.common.locks",
    "gpu_memory_service.common.cuda_utils",
    "gpu_memory_service.common.protocol",
    "gpu_memory_service.common.protocol.messages",
@@ -141,11 +142,13 @@ STUB_MODULES = [
    "gpu_memory_service.integrations.common",
    "gpu_memory_service.integrations.common.utils",
    "gpu_memory_service.integrations.sglang",
+    "gpu_memory_service.integrations.sglang.patches",
    "gpu_memory_service.integrations.sglang.memory_saver",
    "gpu_memory_service.integrations.vllm",
    "gpu_memory_service.integrations.vllm.worker",
    "gpu_memory_service.server",
    "gpu_memory_service.server.allocations",
+    "gpu_memory_service.server.fsm",
    "gpu_memory_service.server.gms",
    "gpu_memory_service.server.rpc",
    "gpu_memory_service.server.session",