# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Tiltfile for developing the Dynamo Kubernetes Operator.
#
# Usage:
#   cd deploy/operator
#   # edit tilt-settings.yaml as needed
#   tilt up
#
# What it does:
#   1. Compiles the Go manager binary locally (fast, native).
#   2. Builds a minimal Docker image containing only the binary.
#   3. Renders the production Helm chart (deploy/helm/charts/platform) with
#      `helm template`, applies CRDs via kubectl, and deploys all rendered
#      resources via k8s_yaml().
#   4. On code change Tilt recompiles the binary and live-updates it into the
#      running container — no full image rebuild needed.
#
# Prerequisites:
#   - Go >= 1.25        — compiles the manager binary locally
#   - tilt              — live-reload orchestrator (https://docs.tilt.dev/install.html)
#   - docker            — builds the live-update container image
#   - A Kubernetes cluster reachable via your current kubeconfig context
#
# Auto-downloaded into ./bin/ on first run (via `make kubectl helm`):
#   - helm v3           — renders the platform Helm chart
#   - kubectl >= 1.29   — applies CRDs and creates the namespace
#
# Conditional (only when skip_codegen is false, the default):
#   - yq v4+            — post-processes generated CRD YAML (run `make yq`)
#   - python3 + pydantic — generates Pydantic models from Go types
#
# The tilt restart_process extension is auto-fetched on first `tilt up`.

load('ext://restart_process', 'docker_build_with_restart')

# ---------------------------------------------------------------------------
# Settings — defaults are defined here; tilt-settings.yaml overrides them.
# ---------------------------------------------------------------------------
settings = {
    'namespace':            'dynamo-system',
    'enable_nats':          True,       # required for DGD/DGDR workloads
    'enable_etcd':          False,      # only if discoveryBackend is "etcd"
    'enable_kai_scheduler': False,      # GPU-aware scheduling for multi-node
    'enable_grove':         False,      # PodClique-based multi-node orchestration
    'skip_codegen':         False,      # skip make generate/manifests for faster iteration
    'image_pull_secret':    '',         # name of docker-registry Secret for private registries
    'helm_values':          {},         # extra --set overrides passed to helm template
}

# tilt-settings.yaml is optional. When present, its top-level keys replace the
# defaults above; this is a shallow merge, so a nested map such as helm_values
# is replaced wholesale rather than merged key by key.
if os.path.exists('tilt-settings.yaml'):
    data = read_yaml('tilt-settings.yaml', default={})
    if data:
        settings.update(data)

# Only call allow_k8s_contexts() when the user listed contexts explicitly.
if 'allowed_contexts' in settings:
    allow_k8s_contexts(settings['allowed_contexts'])

# Registry — resolved from (highest priority wins):
#   1. REGISTRY env var          (e.g. REGISTRY=docker.io/myuser tilt up)
#   2. "registry" in tilt-settings.yaml
# When set the operator image is pushed as <registry>/controller:tilt-dev.
REGISTRY = os.getenv('REGISTRY', settings.get('registry', ''))
if REGISTRY:
    # Drop trailing slashes so joining with '/' later never yields '//'.
    REGISTRY = REGISTRY.rstrip('/')

NAMESPACE            = settings['namespace']
# A `helm_values:` key with nothing under it parses to null and would replace
# the {} default; fall back to an empty dict so `.items()` keeps working.
HELM_VALUES          = settings['helm_values'] or {}
ENABLE_NATS          = settings['enable_nats']
ENABLE_ETCD          = settings['enable_etcd']
ENABLE_KAI_SCHEDULER = settings['enable_kai_scheduler']
ENABLE_GROVE         = settings['enable_grove']
IMAGE_PULL_SECRET    = settings['image_pull_secret']

# ---------------------------------------------------------------------------
# Operator version — passed as --operator-version to the manager binary.
# The Helm chart uses .Chart.AppVersion; for Tilt dev we read it from the
# operator subchart's Chart.yaml so it stays in sync automatically.
# Override via tilt-settings.yaml if needed:
#
#   tilt-settings.yaml:
#     operator_version: "1.2.3"
# ---------------------------------------------------------------------------
def _read_chart_app_version():
    """Return the operator subchart's appVersion as a string.

    Looks for Chart.yaml under ../helm/charts/platform/components/operator
    relative to the current working directory.

    Returns:
      str(appVersion) when the file exists and defines the key, otherwise
      the placeholder '0.0.0-dev'.
    """
    fallback = '0.0.0-dev'
    chart_yaml = os.path.join(
        os.getcwd(),
        '..', 'helm', 'charts', 'platform',
        'components', 'operator', 'Chart.yaml',
    )

    # No chart on disk: nothing to read.
    if not os.path.exists(chart_yaml):
        return fallback

    metadata = read_yaml(chart_yaml, default={})
    # An empty file or a chart without appVersion also gets the placeholder.
    if not metadata or 'appVersion' not in metadata:
        return fallback

    return str(metadata['appVersion'])

# Always a string: an unquoted YAML override such as `operator_version: 1.2`
# parses as a number and would break the later '--operator-version=' + ...
# concatenation. A missing, null or empty override uses the chart's appVersion.
OPERATOR_VERSION = str(settings.get('operator_version') or _read_chart_app_version())

# ---------------------------------------------------------------------------
# Paths (relative to this Tiltfile, i.e. deploy/operator/)
# ---------------------------------------------------------------------------
OPERATOR_DIR = os.getcwd()  # deploy/operator
HELM_CHART = os.path.join(
    OPERATOR_DIR, '..', 'helm', 'charts', 'platform',
)  # deploy/helm/charts/platform
CRD_DIR = os.path.join(HELM_CHART, 'components', 'operator', 'crds')

# ---------------------------------------------------------------------------
# Local tool binaries — ensure kubectl and helm are downloaded via `make`
# so that all invocations use the pinned, version-suffixed copies in ./bin/.
# The Makefile download targets create unversioned symlinks (e.g. bin/kubectl)
# so we can reference them by their short names here.
# ---------------------------------------------------------------------------
TOOLS_BIN_DIR = os.path.join(OPERATOR_DIR, 'bin')

# Runs while the Tiltfile loads, before either tool path below is used.
_fetch_tools_cmd = 'make --directory="%s" kubectl helm' % OPERATOR_DIR
local(_fetch_tools_cmd, quiet=True)

helm_cmd    = os.path.join(TOOLS_BIN_DIR, 'helm')
kubectl_cmd = os.path.join(TOOLS_BIN_DIR, 'kubectl')

# ---------------------------------------------------------------------------
# Image reference — prefixed with the registry only when one is configured.
# ---------------------------------------------------------------------------
IMG_NAME = 'controller'
IMG_TAG  = 'tilt-dev'
if REGISTRY:
    IMG = REGISTRY + '/' + IMG_NAME
else:
    IMG = IMG_NAME
IMG_REF = IMG + ':' + IMG_TAG

# ---------------------------------------------------------------------------
# Compile the manager binary locally (much faster than building in Docker)
# ---------------------------------------------------------------------------
def compile_manager(goos='linux', goarch='amd64'):
    """Return the shell command that cross-compiles the manager binary.

    The binary is built from ./cmd/main.go into tilt_bin/manager with CGO
    disabled.

    Args:
      goos: value for GOOS. Defaults to 'linux'.
      goarch: value for GOARCH. Defaults to 'amd64'; pass e.g. 'arm64' when
        the cluster nodes use a different CPU architecture.

    Returns:
      The `go build` command line as a single string.
    """
    return 'CGO_ENABLED=0 GOOS=%s GOARCH=%s go build -o tilt_bin/manager ./cmd/main.go' % (goos, goarch)

# Go sources and module files whose changes trigger a recompile.
_manager_build_deps = ['api/', 'cmd/', 'internal/', 'go.mod', 'go.sum']

# The generated deepcopy file is excluded from the watch list.
# NOTE(review): presumably because `make generate` rewrites it — confirm.
local_resource(
    'manager-build',
    cmd=compile_manager(),
    deps=_manager_build_deps,
    ignore=['**/zz_generated.deepcopy.go'],
    labels=['operator'],
)

# ---------------------------------------------------------------------------
# CRDs — regenerate & apply via server-side apply on change
# ---------------------------------------------------------------------------
SKIP_CODEGEN = settings['skip_codegen']

# Both paths are absolute and derived from the checkout location, so quote
# them: an unquoted path containing whitespace would be split by the shell.
_crd_cmd = '"%s" apply --server-side --force-conflicts -f "%s"' % (kubectl_cmd, CRD_DIR)

# Unless codegen is skipped, regenerate code and manifests before applying.
if not SKIP_CODEGEN:
    _crd_cmd = 'make generate && make manifests && ' + _crd_cmd

# Re-runs whenever the API types change; the generated deepcopy file is
# ignored so it does not trigger another run.
local_resource(
    'crds',
    _crd_cmd,
    deps=['api/'],
    ignore=['**/zz_generated.deepcopy.go'],
    labels=['operator'],
)

# ---------------------------------------------------------------------------
# Helm template → k8s_yaml
#
# Renders the production Helm chart (deploy/helm/charts/platform) with the
# operator and required infrastructure (NATS by default). This gives you a
# fully working cluster where you can apply DGDR/DGD resources and have them
# reconcile into real workloads on your GPU nodes — while live-reloading the
# controller binary on every code change.
#
# The chart has no Helm hooks — webhook certificates, CA bundle injection,
# and MPI SSH key generation are all handled by the operator binary at
# runtime (auto mode).
# ---------------------------------------------------------------------------
def render_helm():
    """Render the platform Helm chart and patch the output for Tilt.

    Runs `helm template` on HELM_CHART with the operator subchart enabled,
    the NATS / etcd / kai-scheduler / grove toggles taken from tilt-settings,
    the manager image pointed at IMG:IMG_TAG, and CRD rendering disabled.
    IMAGE_PULL_SECRET and the user's helm_values are appended as extra
    `--set` flags.

    Each rendered document is then patched:
      * a document with metadata but no metadata.namespace, whose kind is not
        in the cluster-scoped list, is assigned NAMESPACE;
      * Deployments lose their pod- and container-level securityContext so
        Tilt's live_update can write the new binary into the container.

    Returns:
      The patched documents re-encoded as a YAML stream for k8s_yaml().
    """
    # Booleans are lower-cased so Helm receives true/false.
    nats_flag = str(ENABLE_NATS).lower()

    helm_template_cmd = [
        helm_cmd, 'template', 'dynamo', HELM_CHART,
        '--namespace', NAMESPACE,
        '--set', 'dynamo-operator.enabled=true',
        # Subcharts — NATS is on by default (workers need it)
        '--set', 'nats.enabled=%s' % nats_flag,
        '--set', 'dynamo-operator.nats.enabled=%s' % nats_flag,
        '--set', 'global.etcd.install=%s' % str(ENABLE_ETCD).lower(),
        '--set', 'global.kai-scheduler.install=%s' % str(ENABLE_KAI_SCHEDULER).lower(),
        '--set', 'global.grove.install=%s' % str(ENABLE_GROVE).lower(),
        # Point image at our Tilt-managed image
        '--set', 'dynamo-operator.controllerManager.manager.image.repository=' + IMG,
        '--set', 'dynamo-operator.controllerManager.manager.image.tag=' + IMG_TAG,
        '--set', 'dynamo-operator.controllerManager.manager.image.pullPolicy=IfNotPresent',
        # CRDs are applied separately by the `crds` local_resource
        '--set', 'dynamo-operator.upgradeCRD=false',
        '--skip-crds',
    ]

    # Wire in imagePullSecrets when a pull secret is configured
    if IMAGE_PULL_SECRET:
        helm_template_cmd += ['--set', 'dynamo-operator.imagePullSecrets[0].name=' + IMAGE_PULL_SECRET]

    # Append user-provided Helm overrides from tilt-settings. Booleans are
    # lower-cased like the built-in toggles above and null is passed as
    # `null`; every other value goes through str() unchanged.
    for key, value in HELM_VALUES.items():
        if value == None:  # Starlark has no `is`; `== None` is the null test
            rendered_value = 'null'
        elif type(value) == 'bool':
            rendered_value = str(value).lower()
        else:
            rendered_value = str(value)
        helm_template_cmd += ['--set', '%s=%s' % (key, rendered_value)]

    rendered = local(helm_template_cmd, quiet=True)

    # Kinds that must not be given a namespace. Built once, outside the loop.
    cluster_scoped_kinds = [
        'ClusterRole', 'ClusterRoleBinding',
        'ValidatingWebhookConfiguration', 'MutatingWebhookConfiguration',
        'CustomResourceDefinition', 'Namespace',
        'PriorityClass', 'StorageClass', 'IngressClass',
    ]

    # Decode the YAML stream so we can patch individual documents
    patched = []
    for doc in decode_yaml_stream(rendered):
        # Skip null documents in the stream.
        if doc == None:
            continue

        kind = doc.get('kind', '')

        # Ensure namespaced resources land in the target namespace.
        if 'metadata' in doc and 'namespace' not in doc['metadata'] and kind not in cluster_scoped_kinds:
            doc['metadata']['namespace'] = NAMESPACE

        # Strip securityContext so Tilt's live_update (writing into the
        # container as root) doesn't get blocked by non-root restrictions.
        if kind == 'Deployment':
            pod_spec = doc.get('spec', {}).get('template', {}).get('spec', {})
            pod_spec.pop('securityContext', None)
            for container in pod_spec.get('containers', []):
                container.pop('securityContext', None)

        patched.append(doc)

    return encode_yaml_stream(patched)

# Create the namespace before applying anything else. The trailing `|| true`
# makes the shell command succeed even when kubectl fails (for example when
# the namespace already exists), so the Tiltfile keeps loading.
_create_namespace_cmd = '%s create namespace %s || true' % (kubectl_cmd, NAMESPACE)
local(_create_namespace_cmd, quiet=True)

# Hand the rendered and patched manifests to Tilt.
_platform_manifests = render_helm()
k8s_yaml(_platform_manifests)

# ---------------------------------------------------------------------------
# Docker image — minimal container with just the compiled binary
# ---------------------------------------------------------------------------
# The binary is copied to both /manager and /workspace/manager; only /manager
# is used by the entrypoint and the live_update sync below.
DOCKERFILE = '''
FROM alpine:3.20 AS base
RUN apk add --no-cache ca-certificates
FROM base
WORKDIR /
COPY ./tilt_bin/manager /manager
COPY ./tilt_bin/manager /workspace/manager
ENTRYPOINT ["/manager"]
'''

# Command line the container starts (and restarts) the manager with.
_manager_entrypoint = [
    '/manager',
    '--config=/etc/dynamo-operator/config.yaml',
    '--operator-version=' + OPERATOR_VERSION,
]

# Only the compiled binary is part of the build context; on change it is
# synced over /manager in the running container instead of rebuilding.
docker_build_with_restart(
    IMG_REF,
    context='.',
    dockerfile_contents=DOCKERFILE,
    entrypoint=_manager_entrypoint,
    only=['./tilt_bin/manager'],
    live_update=[sync('./tilt_bin/manager', '/manager')],
)

# Without a registry the image reference carries no registry prefix.
if not REGISTRY:
    print('WARNING: no registry configured — image will only be available locally.')
    print('  Set "registry" in tilt-settings.yaml or pass REGISTRY env var.')

# ---------------------------------------------------------------------------
# Resource grouping — keep the Tilt UI tidy
# ---------------------------------------------------------------------------
# NOTE(review): the workload name presumably follows from the 'dynamo' release
# name used in render_helm — confirm against the chart's naming template.
_operator_workload = 'dynamo-dynamo-operator-controller-manager'

# The operator waits for the CRDs and the first local build of the binary.
k8s_resource(
    workload=_operator_workload,
    new_name='operator',
    labels=['operator'],
    resource_deps=['crds', 'manager-build'],
    port_forwards=['8081:8081'],  # health endpoint
)

# Group subchart workloads in the Tilt UI
if ENABLE_NATS:
    k8s_resource(workload='dynamo-nats', labels=['infrastructure'])