Tiltfile 11 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Tiltfile for developing the Dynamo Kubernetes Operator.
#
# Usage:
#   cd deploy/operator
#   # edit tilt-settings.yaml as needed
#   tilt up
#
# What it does:
#   1. Compiles the Go manager binary locally (fast, native).
#   2. Builds a minimal Docker image containing only the binary.
#   3. Renders the production Helm chart (deploy/helm/charts/platform) with
#      `helm template`, applies CRDs via kubectl, and deploys all rendered
#      resources via k8s_yaml().
#   4. On code change Tilt recompiles the binary and live-updates it into the
#      running container — no full image rebuild needed.
#
# Prerequisites (must be in PATH):
#   - Go >= 1.25        — compiles the manager binary locally
#   - tilt              — live-reload orchestrator (https://docs.tilt.dev/install.html)
#   - helm v3           — renders the platform Helm chart
#   - kubectl >= 1.29   — applies CRDs and creates the namespace
#   - docker            — builds the live-update container image
#   - A Kubernetes cluster reachable via your current kubeconfig context
#
# Conditional (only when skip_codegen is false, the default):
#   - yq v4+            — post-processes generated CRD YAML (run `make ensure-yq`)
#   - python3 + pydantic — generates Pydantic models from Go types
#
# The tilt restart_process extension is auto-fetched on first `tilt up`.

load('ext://restart_process', 'docker_build_with_restart')

# ---------------------------------------------------------------------------
# Settings — defaults are defined here; tilt-settings.yaml overrides them.
# ---------------------------------------------------------------------------
# Built-in defaults. Any key present in tilt-settings.yaml replaces the
# value given here.
settings = {
    # Namespace the rendered chart is deployed into.
    'namespace': 'dynamo-system',
    # Required for DGD/DGDR workloads.
    'enable_nats': True,
    # Only needed if discoveryBackend is "etcd".
    'enable_etcd': False,
    # GPU-aware scheduling for multi-node.
    'enable_kai_scheduler': False,
    # PodClique-based multi-node orchestration.
    'enable_grove': False,
    # Skip make generate/manifests for faster iteration.
    'skip_codegen': False,
    # Name of a docker-registry Secret for private registries.
    'image_pull_secret': '',
    # Extra --set overrides passed to helm template.
    'helm_values': {},
}

# Overlay the user's settings file, if there is one and it is non-empty.
_SETTINGS_FILE = 'tilt-settings.yaml'
if os.path.exists(_SETTINGS_FILE):
    data = read_yaml(_SETTINGS_FILE, default={})
    if data:
        settings.update(data)

# "allowed_contexts" has no default; it is only honoured when the user sets it.
if 'allowed_contexts' in settings:
    allow_k8s_contexts(settings['allowed_contexts'])

# Registry — resolved from (highest priority wins):
#   1. REGISTRY env var          (e.g. REGISTRY=docker.io/myuser tilt up)
#   2. "registry" in tilt-settings.yaml
# When set the operator image is pushed as <registry>/controller:tilt-dev.
# The settings value is only the fallback passed to os.getenv, so the
# REGISTRY environment variable takes precedence over tilt-settings.yaml.
REGISTRY = os.getenv('REGISTRY', settings.get('registry', ''))
if REGISTRY:
    # Drop trailing slashes so '<registry>/<image>' never contains '//'.
    REGISTRY = REGISTRY.rstrip('/')

# Resolved settings, bound to module-level names for use below.
NAMESPACE            = settings['namespace']
HELM_VALUES          = settings['helm_values']
ENABLE_NATS          = settings['enable_nats']
ENABLE_ETCD          = settings['enable_etcd']
ENABLE_KAI_SCHEDULER = settings['enable_kai_scheduler']
ENABLE_GROVE         = settings['enable_grove']
IMAGE_PULL_SECRET    = settings['image_pull_secret']

# ---------------------------------------------------------------------------
# Operator version — passed as --operator-version to the manager binary.
# The Helm chart uses .Chart.AppVersion; for Tilt dev we read it from the
# operator subchart's Chart.yaml so it stays in sync automatically.
# Override via tilt-settings.yaml if needed:
#
#   tilt-settings.yaml:
#     operator_version: "1.2.3"
# ---------------------------------------------------------------------------
def _read_chart_app_version():
    """Read appVersion from the operator subchart's Chart.yaml.

    Returns:
      The chart's appVersion as a string, or '0.0.0-dev' when Chart.yaml is
      missing, empty, or has no appVersion entry.
    """
    chart_path = os.path.join(
        os.getcwd(), '..', 'helm', 'charts', 'platform',
        'components', 'operator', 'Chart.yaml')
    if not os.path.exists(chart_path):
        return '0.0.0-dev'
    chart = read_yaml(chart_path, default={})
    if chart and 'appVersion' in chart:
        # str(): an unquoted YAML scalar such as 1.0 is decoded as a number.
        return str(chart['appVersion'])
    return '0.0.0-dev'

def _resolve_operator_version():
    """Pick the operator version: tilt-settings override, else chart appVersion.

    Returns:
      The version as a string. It is later concatenated into the
      '--operator-version=' flag, so it must never be a number or None.
    """
    # Always read the chart first, exactly as before, so the lookup happens
    # whether or not an override is configured.
    chart_version = _read_chart_app_version()
    override = settings.get('operator_version')
    # Starlark has no `is`; `== None` is the idiomatic null check.
    if override == None:
        # Key absent, or present with an empty value in tilt-settings.yaml.
        return chart_version
    # An unquoted override such as `operator_version: 1.0` arrives as a
    # number; coerce it the same way the chart value is coerced.
    return str(override)

OPERATOR_VERSION = _resolve_operator_version()

# ---------------------------------------------------------------------------
# Paths (relative to this Tiltfile, i.e. deploy/operator/)
# ---------------------------------------------------------------------------
# Directory of this Tiltfile: deploy/operator
OPERATOR_DIR = os.getcwd()
# Platform chart: deploy/helm/charts/platform
HELM_CHART = os.path.join(OPERATOR_DIR, '..', 'helm', 'charts', 'platform')
# CRD manifests that live inside the operator subchart.
CRD_DIR = os.path.join(HELM_CHART, 'components', 'operator', 'crds')

IMG_NAME = 'controller'
IMG_TAG = 'tilt-dev'
# Image repository: bare name by default, '<registry>/<name>' when a
# registry is configured.
IMG = IMG_NAME
if REGISTRY:
    IMG = REGISTRY + '/' + IMG_NAME
# Full image reference, '<repository>:<tag>'.
IMG_REF = '%s:%s' % (IMG, IMG_TAG)

# ---------------------------------------------------------------------------
# Compile the manager binary locally (much faster than building in Docker)
# ---------------------------------------------------------------------------
def compile_manager():
    """Return the shell command that builds the manager binary.

    The command builds ./cmd/main.go as a static (CGO disabled) linux/amd64
    executable and writes it to tilt_bin/manager.
    """
    build_env = 'CGO_ENABLED=0 GOOS=linux GOARCH=amd64'
    build_cmd = 'go build -o tilt_bin/manager ./cmd/main.go'
    return build_env + ' ' + build_cmd

# Re-run the local Go build whenever operator sources or module files change.
local_resource(
    name='manager-build',
    cmd=compile_manager(),
    deps=['api/', 'cmd/', 'internal/', 'go.mod', 'go.sum'],
    # Changes to generated deepcopy files do not trigger a rebuild
    # (presumably because codegen rewrites them — verify).
    ignore=['**/zz_generated.deepcopy.go'],
    labels=['operator'],
)

# ---------------------------------------------------------------------------
# CRDs — regenerate & apply via server-side apply on change
# ---------------------------------------------------------------------------
SKIP_CODEGEN = settings['skip_codegen']

# CRD_DIR is an absolute path derived from the checkout location and the
# command below runs through a shell, so quote it: a directory name with
# spaces or shell metacharacters would otherwise split the -f argument.
_crd_cmd = 'kubectl apply --server-side --force-conflicts -f ' + shlex.quote(CRD_DIR)
if not SKIP_CODEGEN:
    # Regenerate Go code and CRD manifests before applying them.
    _crd_cmd = 'make generate && make manifests && ' + _crd_cmd

local_resource(
    'crds',
    _crd_cmd,
    # Re-run when the API types change.
    deps=['api/'],
    # Changes to generated deepcopy files do not re-trigger this resource.
    ignore=['**/zz_generated.deepcopy.go'],
    labels=['operator'],
)

# ---------------------------------------------------------------------------
# Helm template → k8s_yaml
#
# Renders the production Helm chart (deploy/helm/charts/platform) with the
# operator and required infrastructure (NATS by default). This gives you a
# fully working cluster where you can apply DGDR/DGD resources and have them
# reconcile into real workloads on your GPU nodes — while live-reloading the
# controller binary on every code change.
#
# The chart has no Helm hooks — webhook certificates, CA bundle injection,
# and MPI SSH key generation are all handled by the operator binary at
# runtime (auto mode).
# ---------------------------------------------------------------------------
def _helm_bool(flag):
    """Format a boolean setting as the lowercase text passed to `helm --set`."""
    return str(flag).lower()

def render_helm():
    """Render the platform Helm chart and patch the output for Tilt.

    Runs `helm template` with the operator enabled, the optional subcharts
    (NATS, etcd, kai-scheduler, grove) toggled from the settings, the image
    pointed at the Tilt-managed one, and any user-supplied `helm_values`
    appended last. Each rendered document is then patched:

      * documents with metadata but no namespace get NAMESPACE, unless their
        kind is in the cluster-scoped list below;
      * Deployments lose their pod- and container-level securityContext.

    Returns:
      The patched documents re-encoded as a YAML stream, ready for k8s_yaml().
    """
    nats_flag = _helm_bool(ENABLE_NATS)

    helm_cmd = [
        'helm', 'template', 'dynamo', HELM_CHART,
        '--namespace', NAMESPACE,
        '--set', 'dynamo-operator.enabled=true',
        # Subcharts — NATS is on by default (workers need it)
        '--set', 'nats.enabled=' + nats_flag,
        '--set', 'dynamo-operator.nats.enabled=' + nats_flag,
        '--set', 'global.etcd.install=' + _helm_bool(ENABLE_ETCD),
        '--set', 'global.kai-scheduler.install=' + _helm_bool(ENABLE_KAI_SCHEDULER),
        '--set', 'global.grove.install=' + _helm_bool(ENABLE_GROVE),
        # Point image at our Tilt-managed image
        '--set', 'dynamo-operator.controllerManager.manager.image.repository=' + IMG,
        '--set', 'dynamo-operator.controllerManager.manager.image.tag=' + IMG_TAG,
        '--set', 'dynamo-operator.controllerManager.manager.image.pullPolicy=IfNotPresent',
        # CRDs are applied separately by the 'crds' local_resource
        '--set', 'dynamo-operator.upgradeCRD=false',
        '--skip-crds',
    ]

    # Wire in imagePullSecrets when a pull secret is configured
    if IMAGE_PULL_SECRET:
        helm_cmd += ['--set', 'dynamo-operator.imagePullSecrets[0].name=' + IMAGE_PULL_SECRET]

    # Append user-provided Helm overrides from tilt-settings. A bare
    # `helm_values:` key in the YAML decodes to None, so fall back to an
    # empty dict instead of failing on `.items()`.
    for k, v in (HELM_VALUES or {}).items():
        helm_cmd += ['--set', '%s=%s' % (k, v)]

    data = local(helm_cmd, quiet=True)

    # Cluster-scoped kinds must not have a namespace set. Built once, outside
    # the loop, because it does not depend on the document being patched.
    cluster_scoped_kinds = [
        'ClusterRole', 'ClusterRoleBinding',
        'ValidatingWebhookConfiguration', 'MutatingWebhookConfiguration',
        'CustomResourceDefinition', 'Namespace',
        'PriorityClass', 'StorageClass', 'IngressClass',
    ]

    # Decode the YAML stream so we can patch individual documents
    patched = []
    for doc in decode_yaml_stream(data):
        # Empty documents in the stream decode to None; drop them.
        # (Starlark has no `is` operator, so `== None` is the null check.)
        if doc == None:
            continue

        kind = doc.get('kind', '')

        # Ensure namespaced resources land in the target namespace.
        if 'metadata' in doc and 'namespace' not in doc['metadata'] and kind not in cluster_scoped_kinds:
            doc['metadata']['namespace'] = NAMESPACE

        # Strip securityContext so Tilt's live_update (writing into the
        # container as root) doesn't get blocked by non-root restrictions.
        if kind == 'Deployment':
            pod_spec = doc.get('spec', {}).get('template', {}).get('spec', {})
            pod_spec.pop('securityContext', None)
            for c in pod_spec.get('containers', []):
                c.pop('securityContext', None)

        patched.append(doc)

    return encode_yaml_stream(patched)

# Create the namespace before applying anything else
# `|| true` makes the shell command succeed even when kubectl fails (for
# example because the namespace already exists), so Tiltfile evaluation
# continues. Note that it masks every other kubectl failure as well.
local('kubectl create namespace %s || true' % NAMESPACE, quiet=True)

# Register every rendered (and patched) chart document with Tilt.
k8s_yaml(render_helm())

# ---------------------------------------------------------------------------
# Docker image — minimal container with just the compiled binary
# ---------------------------------------------------------------------------
# Inline Dockerfile: Alpine plus CA certificates, with the locally compiled
# binary copied to /manager (the entrypoint) and also to /workspace/manager.
# NOTE(review): the reason for the second copy under /workspace is not
# evident from this file — confirm it is still needed.
DOCKERFILE = '''
FROM alpine:3.20 AS base
RUN apk add --no-cache ca-certificates
FROM base
WORKDIR /
COPY ./tilt_bin/manager /manager
COPY ./tilt_bin/manager /workspace/manager
ENTRYPOINT ["/manager"]
'''

# Build the operator image from the inline Dockerfile. The helper comes from
# the restart_process extension loaded at the top of this file.
docker_build_with_restart(
    IMG_REF,
    context='.',
    dockerfile_contents=DOCKERFILE,
    # Command line for the manager: config file path plus the resolved
    # operator version.
    entrypoint=['/manager', '--config=/etc/dynamo-operator/config.yaml',
                '--operator-version=' + OPERATOR_VERSION],
    # Only the compiled binary is part of the build context.
    only=['./tilt_bin/manager'],
    live_update=[
        # Copy a freshly built binary over /manager in the running container
        # instead of rebuilding the image.
        sync('./tilt_bin/manager', '/manager'),
    ],
)

# Informational only: evaluation continues without a registry, and the image
# keeps its bare name (no '<registry>/' prefix).
if not REGISTRY:
    print('WARNING: no registry configured — image will only be available locally.')
    print('  Set "registry" in tilt-settings.yaml or pass REGISTRY env var.')

# ---------------------------------------------------------------------------
# Resource grouping — keep the Tilt UI tidy
# ---------------------------------------------------------------------------
# Operator Deployment: shown as 'operator' in the Tilt UI and held back until
# the 'crds' and 'manager-build' resources are ready.
# NOTE(review): the workload name presumably comes from the Helm release name
# 'dynamo' plus the chart's naming template — verify if the chart is renamed.
k8s_resource(
    workload='dynamo-dynamo-operator-controller-manager',
    new_name='operator',
    labels=['operator'],
    port_forwards=['8081:8081'],  # health endpoint
    resource_deps=['crds', 'manager-build'],
)

# Group subchart workloads in the Tilt UI
# Only NATS is grouped here, and only when it is enabled: in that case the
# 'dynamo-nats' workload is expected in the rendered output.
if ENABLE_NATS:
    k8s_resource(
        workload='dynamo-nats',
        labels=['infrastructure'],
    )