"lib/llm/src/vscode:/vscode.git/clone" did not exist on "c7080419679802a40cfb6e4ed805fcf847f35c34"
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Build script for GPU Memory Service with C++ extensions.

This setup.py builds the C++ extensions as part of pip install.
The _allocator_ext extension only requires Python headers (no CUDA or PyTorch needed).

Following the torch_memory_saver pattern of using pure setuptools for extension building.
"""

from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext


class BuildExtension(build_ext):
    """Custom ``build_ext`` command that builds extensions with a C++ compiler.

    The compiler command is read from the ``CXX`` environment variable and
    falls back to ``g++`` when the variable is unset or blank.
    """

    def build_extensions(self):
        """Override the compiler/linker executables, then run the stock build.

        Sets the ``compiler_so``, ``compiler_cxx`` and ``linker_so``
        executables on ``self.compiler`` to the selected C++ compiler before
        delegating to ``build_ext.build_extensions``.
        """
        import os

        # An exported-but-empty CXX (e.g. ``CXX=`` in a CI environment) is
        # treated like an unset one; otherwise an empty command line would be
        # installed as the compiler and linker.
        cxx = os.environ.get("CXX", "").strip() or "g++"
        self.compiler.set_executable("compiler_so", cxx)
        self.compiler.set_executable("compiler_cxx", cxx)
        # The shared-object link step reuses the same compiler driver.
        self.compiler.set_executable("linker_so", f"{cxx} -shared")

        build_ext.build_extensions(self)


def _create_ext_modules():
    """Return the list of C++ extension modules built for gpu_memory_service.

    The list currently holds a single extension, ``_allocator_ext``, compiled
    from ``client/torch/extensions/allocator.cpp``.
    """
    # Flags shared by every extension in this package.
    cxx_flags = ["-std=c++17", "-O3", "-fPIC"]

    # _allocator_ext is the CUDAPluggableAllocator shim. It uses only the
    # Python C API (no CUDA or PyTorch dependency) and provides
    # my_malloc/my_free, which call Python callbacks.
    allocator_ext = Extension(
        name="gpu_memory_service.client.torch.extensions._allocator_ext",
        sources=["client/torch/extensions/allocator.cpp"],
        extra_compile_args=cxx_flags,
    )
    return [allocator_ext]


setup(
    name="gpu-memory-service",
    version="0.8.0",
    description="GPU Memory Service for Dynamo - CUDA VMM-based GPU memory allocation and sharing",
    author="NVIDIA Inc.",
    author_email="sw-dl-dynamo@nvidia.com",
    license="Apache-2.0",
    python_requires=">=3.10",
    install_requires=[
        "msgpack>=1.0",
        "uvloop>=0.21.0",
    ],
    extras_require={
        "test": [
            "pytest>=8.3.4",
            "pytest-asyncio",
        ],
    },
    # Package directory mapping: the current directory IS the gpu_memory_service package
    packages=[
        "gpu_memory_service",
        "gpu_memory_service.common",
        "gpu_memory_service.common.protocol",
        "gpu_memory_service.server",
        "gpu_memory_service.client",
        "gpu_memory_service.client.torch",
        "gpu_memory_service.client.torch.extensions",
        "gpu_memory_service.vllm_integration",
    ],
    # Each package listed above is mapped to its directory relative to this file.
    package_dir={
        "gpu_memory_service": ".",
        "gpu_memory_service.common": "common",
        "gpu_memory_service.common.protocol": "common/protocol",
        "gpu_memory_service.server": "server",
        "gpu_memory_service.client": "client",
        "gpu_memory_service.client.torch": "client/torch",
        "gpu_memory_service.client.torch.extensions": "client/torch/extensions",
        "gpu_memory_service.vllm_integration": "vllm_integration",
    },
    # Include the C++ sources of the extensions package as package data.
    package_data={
        "gpu_memory_service.client.torch.extensions": ["*.cpp"],
    },
    ext_modules=_create_ext_modules(),
    cmdclass={"build_ext": BuildExtension},
)