Unverified commit 39a6a240, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

refactor: simplify GPU Memory Service integrations and module boundaries (#7875)

parent 02666f04
...@@ -6,13 +6,16 @@ from __future__ import annotations ...@@ -6,13 +6,16 @@ from __future__ import annotations
import sys import sys
import types import types
import gpu_memory_service.integrations.sglang.patches as sglang_patches
import pytest import pytest
from gpu_memory_service.integrations.sglang import patches as sglang_patches
torch = pytest.importorskip("torch", reason="torch is required")
pytestmark = [ pytestmark = [
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.unit, pytest.mark.unit,
pytest.mark.gpu_0, pytest.mark.gpu_0,
pytest.mark.sglang,
] ]
...@@ -37,8 +40,7 @@ def test_patch_model_runner_rewrites_total_gpu_memory(monkeypatch): ...@@ -37,8 +40,7 @@ def test_patch_model_runner_rewrites_total_gpu_memory(monkeypatch):
) )
class FakeImpl: class FakeImpl:
def get_imported_weights_bytes(self): imported_weights_bytes = 8 << 30
return 8 << 30
fake_memory_saver.get_gms_memory_saver_impl = lambda: FakeImpl() fake_memory_saver.get_gms_memory_saver_impl = lambda: FakeImpl()
...@@ -55,6 +57,11 @@ def test_patch_model_runner_rewrites_total_gpu_memory(monkeypatch): ...@@ -55,6 +57,11 @@ def test_patch_model_runner_rewrites_total_gpu_memory(monkeypatch):
"gpu_memory_service.integrations.sglang.memory_saver", "gpu_memory_service.integrations.sglang.memory_saver",
fake_memory_saver, fake_memory_saver,
) )
monkeypatch.setattr(
sglang_patches,
"get_gms_memory_saver_impl",
lambda: FakeImpl(),
)
monkeypatch.setattr(sglang_patches, "_model_runner_patched", False) monkeypatch.setattr(sglang_patches, "_model_runner_patched", False)
monkeypatch.delattr(FakeModelRunner, "_gms_patched", raising=False) monkeypatch.delattr(FakeModelRunner, "_gms_patched", raising=False)
monkeypatch.setattr( monkeypatch.setattr(
......
...@@ -5,11 +5,12 @@ from __future__ import annotations ...@@ -5,11 +5,12 @@ from __future__ import annotations
import pytest import pytest
from gpu_memory_service.client.torch import allocator as allocator_module from gpu_memory_service.client.torch import allocator as allocator_module
from gpu_memory_service.common.types import GrantedLockType, RequestedLockType from gpu_memory_service.common.locks import GrantedLockType, RequestedLockType
pytestmark = [ pytestmark = [
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.unit, pytest.mark.unit,
pytest.mark.none,
pytest.mark.gpu_0, pytest.mark.gpu_0,
] ]
......
...@@ -5,16 +5,7 @@ ...@@ -5,16 +5,7 @@
import pytest import pytest
# Skip collection entirely if gpu_memory_service is not installed. from tests.utils.port_utils import allocate_port, deallocate_ports # noqa: E402
# This package lives under nested common/ and integration/ subdirectories, so
# we ignore those directories directly instead of only matching test files next
# to this conftest.
try:
import gpu_memory_service # noqa: F401
except ImportError:
collect_ignore = ["common", "integration"]
from tests.utils.port_utils import allocate_port, deallocate_ports
@pytest.fixture @pytest.fixture
......
...@@ -10,11 +10,13 @@ import subprocess ...@@ -10,11 +10,13 @@ import subprocess
from contextlib import contextmanager from contextlib import contextmanager
import torch import torch
from gpu_memory_service import get_or_create_gms_client_memory_manager
from gpu_memory_service.client.memory_manager import GMSClientMemoryManager from gpu_memory_service.client.memory_manager import GMSClientMemoryManager
from gpu_memory_service.client.torch.allocator import gms_use_mem_pool from gpu_memory_service.client.torch.allocator import (
get_or_create_gms_client_memory_manager,
gms_use_mem_pool,
)
from gpu_memory_service.client.torch.module import register_module_tensors from gpu_memory_service.client.torch.module import register_module_tensors
from gpu_memory_service.common.types import RequestedLockType from gpu_memory_service.common.locks import RequestedLockType
from gpu_memory_service.common.utils import get_socket_path from gpu_memory_service.common.utils import get_socket_path
from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
......
...@@ -7,7 +7,6 @@ import logging ...@@ -7,7 +7,6 @@ import logging
import time import time
from pathlib import Path from pathlib import Path
import pynvml
import requests import requests
from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
...@@ -19,6 +18,8 @@ MIN_EXPECTED_MEMORY_RETURN_FRACTION = 0.6 ...@@ -19,6 +18,8 @@ MIN_EXPECTED_MEMORY_RETURN_FRACTION = 0.6
def get_gpu_memory_used(device: int = 0) -> int: def get_gpu_memory_used(device: int = 0) -> int:
import pynvml
pynvml.nvmlInit() pynvml.nvmlInit()
try: try:
handle = pynvml.nvmlDeviceGetHandleByIndex(device) handle = pynvml.nvmlDeviceGetHandleByIndex(device)
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import pytest
pytest.importorskip("gpu_memory_service", reason="gpu_memory_service is required")
...@@ -10,7 +10,8 @@ from typing import Callable, Protocol ...@@ -10,7 +10,8 @@ from typing import Callable, Protocol
import pytest import pytest
from gpu_memory_service.client.session import _GMSClientSession from gpu_memory_service.client.session import _GMSClientSession
from gpu_memory_service.common.types import RequestedLockType, ServerState from gpu_memory_service.common.locks import RequestedLockType
from gpu_memory_service.server.fsm import ServerState
from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
from tests.utils.managed_process import DynamoFrontendProcess from tests.utils.managed_process import DynamoFrontendProcess
......
...@@ -12,7 +12,7 @@ from contextlib import ExitStack ...@@ -12,7 +12,7 @@ from contextlib import ExitStack
from typing import Callable from typing import Callable
import pytest import pytest
from gpu_memory_service.common.types import ServerState from gpu_memory_service.server.fsm import ServerState
from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess
......
...@@ -9,7 +9,7 @@ from contextlib import ExitStack ...@@ -9,7 +9,7 @@ from contextlib import ExitStack
from typing import Callable from typing import Callable
import pytest import pytest
from gpu_memory_service.common.types import ServerState from gpu_memory_service.server.fsm import ServerState
from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess
......
...@@ -12,7 +12,7 @@ from gpu_memory_service.client.torch.module import ( ...@@ -12,7 +12,7 @@ from gpu_memory_service.client.torch.module import (
register_module_tensors, register_module_tensors,
) )
from gpu_memory_service.client.torch.tensor import _tensor_from_pointer from gpu_memory_service.client.torch.tensor import _tensor_from_pointer
from gpu_memory_service.common.types import RequestedLockType from gpu_memory_service.common.locks import RequestedLockType
from tests.gms.harness.gms import GMSServerProcess from tests.gms.harness.gms import GMSServerProcess
...@@ -21,6 +21,7 @@ torch = pytest.importorskip("torch", reason="torch is required") ...@@ -21,6 +21,7 @@ torch = pytest.importorskip("torch", reason="torch is required")
pytestmark = [ pytestmark = [
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.unit, pytest.mark.unit,
pytest.mark.none,
pytest.mark.gpu_1, pytest.mark.gpu_1,
] ]
......
...@@ -129,6 +129,7 @@ STUB_MODULES = [ ...@@ -129,6 +129,7 @@ STUB_MODULES = [
"gpu_memory_service.client.torch.module", "gpu_memory_service.client.torch.module",
"gpu_memory_service.client.torch.tensor", "gpu_memory_service.client.torch.tensor",
"gpu_memory_service.common", "gpu_memory_service.common",
"gpu_memory_service.common.locks",
"gpu_memory_service.common.cuda_utils", "gpu_memory_service.common.cuda_utils",
"gpu_memory_service.common.protocol", "gpu_memory_service.common.protocol",
"gpu_memory_service.common.protocol.messages", "gpu_memory_service.common.protocol.messages",
...@@ -141,11 +142,13 @@ STUB_MODULES = [ ...@@ -141,11 +142,13 @@ STUB_MODULES = [
"gpu_memory_service.integrations.common", "gpu_memory_service.integrations.common",
"gpu_memory_service.integrations.common.utils", "gpu_memory_service.integrations.common.utils",
"gpu_memory_service.integrations.sglang", "gpu_memory_service.integrations.sglang",
"gpu_memory_service.integrations.sglang.patches",
"gpu_memory_service.integrations.sglang.memory_saver", "gpu_memory_service.integrations.sglang.memory_saver",
"gpu_memory_service.integrations.vllm", "gpu_memory_service.integrations.vllm",
"gpu_memory_service.integrations.vllm.worker", "gpu_memory_service.integrations.vllm.worker",
"gpu_memory_service.server", "gpu_memory_service.server",
"gpu_memory_service.server.allocations", "gpu_memory_service.server.allocations",
"gpu_memory_service.server.fsm",
"gpu_memory_service.server.gms", "gpu_memory_service.server.gms",
"gpu_memory_service.server.rpc", "gpu_memory_service.server.rpc",
"gpu_memory_service.server.session", "gpu_memory_service.server.session",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment