Unverified Commit dbdbd5e5 authored by julienmancuso's avatar julienmancuso Committed by GitHub
Browse files

feat: remove proxy side car (#822)

parent 21e97b0d
......@@ -28,7 +28,6 @@ stringData:
DYNAMO_IMAGE_BUILDER_NAMESPACE: {{ .Release.Namespace }}
INTERNAL_IMAGES_DEBUGGER: {{ .Values.dynamo.internalImages.debugger | quote }}
INTERNAL_IMAGES_PROXY: {{ .Values.dynamo.internalImages.proxy | quote }}
{{- if .Values.dynamo.enableRestrictedSecurityContext }}
ENABLE_RESTRICTED_SECURITY_CONTEXT: "true"
......
......@@ -83,7 +83,6 @@ dynamo:
buildkit: moby/buildkit:v0.20.2
buildkitRootless: moby/buildkit:v0.20.2-rootless
debugger: python:3.12-slim
proxy: envoyproxy/envoy:v1.33-latest
enableRestrictedSecurityContext: false
......
......@@ -46,7 +46,6 @@ dynamo-operator:
buildkit: moby/buildkit:v0.20.2
buildkitRootless: moby/buildkit:v0.20.2-rootless
debugger: python:3.12-slim
proxy: envoyproxy/envoy:v1.33-latest
enableRestrictedSecurityContext: false
dockerRegistry:
server: ""
......
......@@ -42,7 +42,6 @@ import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/dynamo/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/dynamo/operator/internal/controller_common"
commonController "github.com/ai-dynamo/dynamo/deploy/dynamo/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/dynamo/operator/internal/envoy"
"github.com/cisco-open/k8s-objectmatcher/patch"
"github.com/huandu/xstrings"
istioNetworking "istio.io/api/networking/v1beta1"
......@@ -75,20 +74,12 @@ const (
KubeAnnotationEnableStealingTrafficDebugMode = "nvidia.com/enable-stealing-traffic-debug-mode"
KubeAnnotationEnableDebugMode = "nvidia.com/enable-debug-mode"
KubeAnnotationEnableDebugPodReceiveProductionTraffic = "nvidia.com/enable-debug-pod-receive-production-traffic"
KubeAnnotationProxySidecarResourcesLimitsCPU = "nvidia.com/proxy-sidecar-resources-limits-cpu"
KubeAnnotationProxySidecarResourcesLimitsMemory = "nvidia.com/proxy-sidecar-resources-limits-memory"
KubeAnnotationProxySidecarResourcesRequestsCPU = "nvidia.com/proxy-sidecar-resources-requests-cpu"
KubeAnnotationProxySidecarResourcesRequestsMemory = "nvidia.com/proxy-sidecar-resources-requests-memory"
DeploymentTargetTypeProduction = "production"
DeploymentTargetTypeDebug = "debug"
ContainerPortNameHTTPProxy = "http-proxy"
ServicePortNameHTTPNonProxy = "http-non-proxy"
HeaderNameDebug = "X-Nvidia-Debug"
DefaultIngressSuffix = "local"
)
// ServicePortHTTPNonProxy is the Service port number directly above
// commonconsts.DynamoServicePort. generateService publishes it under the
// ServicePortNameHTTPNonProxy port name with the main Dynamo container port as
// its target, and the Envoy sidecar configuration uses it as the upstream port
// when forwarding to the peer (debug or production) Service.
var ServicePortHTTPNonProxy = commonconsts.DynamoServicePort + 1
// DynamoComponentDeploymentReconciler reconciles a DynamoComponentDeployment object
type DynamoComponentDeploymentReconciler struct {
client.Client
......@@ -1136,9 +1127,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
kubeName := r.getKubeName(opt.dynamoComponentDeployment, opt.dynamoComponent, opt.isStealingTrafficDebugModeEnabled)
containerPort := commonconsts.DynamoServicePort
lastPort := containerPort + 1
lastPort++
var envs []corev1.EnvVar
envsSeen := make(map[string]struct{})
......@@ -1388,175 +1376,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
containers = append(containers, container)
lastPort++
proxyPort := lastPort
proxyResourcesRequestsCPUStr := resourceAnnotations[KubeAnnotationProxySidecarResourcesRequestsCPU]
if proxyResourcesRequestsCPUStr == "" {
proxyResourcesRequestsCPUStr = "100m"
}
var proxyResourcesRequestsCPU resource.Quantity
proxyResourcesRequestsCPU, err = resource.ParseQuantity(proxyResourcesRequestsCPUStr)
if err != nil {
err = errors.Wrapf(err, "failed to parse proxy sidecar resources requests cpu: %s", proxyResourcesRequestsCPUStr)
return nil, err
}
proxyResourcesRequestsMemoryStr := resourceAnnotations[KubeAnnotationProxySidecarResourcesRequestsMemory]
if proxyResourcesRequestsMemoryStr == "" {
proxyResourcesRequestsMemoryStr = "200Mi"
}
var proxyResourcesRequestsMemory resource.Quantity
proxyResourcesRequestsMemory, err = resource.ParseQuantity(proxyResourcesRequestsMemoryStr)
if err != nil {
err = errors.Wrapf(err, "failed to parse proxy sidecar resources requests memory: %s", proxyResourcesRequestsMemoryStr)
return nil, err
}
proxyResourcesLimitsCPUStr := resourceAnnotations[KubeAnnotationProxySidecarResourcesLimitsCPU]
if proxyResourcesLimitsCPUStr == "" {
proxyResourcesLimitsCPUStr = "300m"
}
var proxyResourcesLimitsCPU resource.Quantity
proxyResourcesLimitsCPU, err = resource.ParseQuantity(proxyResourcesLimitsCPUStr)
if err != nil {
err = errors.Wrapf(err, "failed to parse proxy sidecar resources limits cpu: %s", proxyResourcesLimitsCPUStr)
return nil, err
}
proxyResourcesLimitsMemoryStr := resourceAnnotations[KubeAnnotationProxySidecarResourcesLimitsMemory]
if proxyResourcesLimitsMemoryStr == "" {
proxyResourcesLimitsMemoryStr = "1000Mi"
}
var proxyResourcesLimitsMemory resource.Quantity
proxyResourcesLimitsMemory, err = resource.ParseQuantity(proxyResourcesLimitsMemoryStr)
if err != nil {
err = errors.Wrapf(err, "failed to parse proxy sidecar resources limits memory: %s", proxyResourcesLimitsMemoryStr)
return nil, err
}
var envoyConfigContent string
if opt.isStealingTrafficDebugModeEnabled {
productionServiceName := r.getServiceName(opt.dynamoComponentDeployment, opt.dynamoComponent, false)
envoyConfigContent, err = envoy.GenerateEnvoyConfigurationContent(envoy.CreateEnvoyConfig{
ListenPort: proxyPort,
DebugHeaderName: HeaderNameDebug,
DebugHeaderValue: commonconsts.KubeLabelValueTrue,
DebugServerAddress: "localhost",
DebugServerPort: containerPort,
ProductionServerAddress: fmt.Sprintf("%s.%s.svc.cluster.local", productionServiceName, opt.dynamoComponentDeployment.Namespace),
ProductionServerPort: ServicePortHTTPNonProxy,
})
} else {
debugServiceName := r.getServiceName(opt.dynamoComponentDeployment, opt.dynamoComponent, true)
envoyConfigContent, err = envoy.GenerateEnvoyConfigurationContent(envoy.CreateEnvoyConfig{
ListenPort: proxyPort,
DebugHeaderName: HeaderNameDebug,
DebugHeaderValue: commonconsts.KubeLabelValueTrue,
DebugServerAddress: fmt.Sprintf("%s.%s.svc.cluster.local", debugServiceName, opt.dynamoComponentDeployment.Namespace),
DebugServerPort: ServicePortHTTPNonProxy,
ProductionServerAddress: "localhost",
ProductionServerPort: containerPort,
})
}
if err != nil {
err = errors.Wrapf(err, "failed to generate envoy configuration content")
return nil, err
}
envoyConfigConfigMapName := fmt.Sprintf("%s-envoy-config", kubeName)
envoyConfigConfigMap := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: envoyConfigConfigMapName,
Namespace: opt.dynamoComponentDeployment.Namespace,
},
Data: map[string]string{
"envoy.yaml": envoyConfigContent,
},
}
err = ctrl.SetControllerReference(opt.dynamoComponentDeployment, envoyConfigConfigMap, r.Scheme)
if err != nil {
err = errors.Wrapf(err, "failed to set controller reference for envoy config config map")
return nil, err
}
_, err = ctrl.CreateOrUpdate(ctx, r.Client, envoyConfigConfigMap, func() error {
envoyConfigConfigMap.Data["envoy.yaml"] = envoyConfigContent
return nil
})
if err != nil {
err = errors.Wrapf(err, "failed to create or update envoy config configmap")
return nil, err
}
volumes = append(volumes, corev1.Volume{
Name: "envoy-config",
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: envoyConfigConfigMapName,
},
},
},
})
proxyImage := "envoyproxy/envoy:v1.33-latest"
proxyImage_ := os.Getenv("INTERNAL_IMAGES_PROXY")
if proxyImage_ != "" {
proxyImage = proxyImage_
}
containers = append(containers, corev1.Container{
Name: "proxy",
Image: proxyImage,
Command: []string{
"envoy",
"--config-path",
"/etc/envoy/envoy.yaml",
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "envoy-config",
MountPath: "/etc/envoy",
},
},
Ports: []corev1.ContainerPort{
{
Name: ContainerPortNameHTTPProxy,
ContainerPort: int32(proxyPort),
Protocol: corev1.ProtocolTCP,
},
{
ContainerPort: int32(9901),
},
},
ReadinessProbe: &corev1.Probe{
InitialDelaySeconds: 5,
TimeoutSeconds: 5,
FailureThreshold: 10,
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/ready",
Port: intstr.FromInt(9901),
},
},
},
LivenessProbe: &corev1.Probe{
InitialDelaySeconds: 5,
TimeoutSeconds: 5,
FailureThreshold: 10,
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/server_info",
Port: intstr.FromInt(9901),
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: proxyResourcesRequestsCPU,
corev1.ResourceMemory: proxyResourcesRequestsMemory,
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: proxyResourcesLimitsCPU,
corev1.ResourceMemory: proxyResourcesLimitsMemory,
},
},
SecurityContext: securityContext,
})
debuggerImage := "python:3.12-slim"
debuggerImage_ := os.Getenv("INTERNAL_IMAGES_DEBUGGER")
if debuggerImage_ != "" {
......@@ -1796,12 +1615,6 @@ func (r *DynamoComponentDeploymentReconciler) generateService(ctx context.Contex
}
targetPort := intstr.FromString(commonconsts.DynamoContainerPortName)
if opt.isGenericService {
delete(selector, commonconsts.KubeLabelDynamoDeploymentTargetType)
if opt.containsStealingTrafficDebugModeEnabled {
targetPort = intstr.FromString(ContainerPortNameHTTPProxy)
}
}
spec := corev1.ServiceSpec{
Selector: selector,
......@@ -1812,12 +1625,6 @@ func (r *DynamoComponentDeploymentReconciler) generateService(ctx context.Contex
TargetPort: targetPort,
Protocol: corev1.ProtocolTCP,
},
{
Name: ServicePortNameHTTPNonProxy,
Port: int32(ServicePortHTTPNonProxy),
TargetPort: intstr.FromString(commonconsts.DynamoContainerPortName),
Protocol: corev1.ProtocolTCP,
},
},
}
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2022 Atalaya Tech. Inc
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* Modifications Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
*/
package envoy
import (
"bytes"
"strings"
"text/template"
)
// EnvoyAdminPort is the port number rendered into the `admin` section of the
// generated Envoy configuration (the `.AdminPort` template value).
const EnvoyAdminPort = 9901
// CreateEnvoyConfig holds the values substituted into configTemplate when an
// Envoy configuration is rendered. The template reads every field through the
// `.Config` key.
type CreateEnvoyConfig struct {
	// ListenPort is the port_value of the single listener (listener_0).
	ListenPort int

	// DebugHeaderName and DebugHeaderValue form the exact-match header rule
	// that selects the service_debug cluster; requests that do not match fall
	// through to service_production.
	DebugHeaderName  string
	DebugHeaderValue string

	// DebugServerAddress and DebugServerPort are the socket address of the
	// service_debug cluster endpoint.
	DebugServerAddress string
	DebugServerPort    int

	// ProductionServerAddress and ProductionServerPort are the socket address
	// of the service_production cluster endpoint.
	ProductionServerAddress string
	ProductionServerPort    int
}
// configTemplate is the text/template source for the Envoy bootstrap
// configuration. It is rendered with a data value exposing two keys:
//
//   - .Config    – a CreateEnvoyConfig (listener port, debug header rule and
//     the debug/production upstream addresses)
//   - .AdminPort – the port of the Envoy admin listener
//
// The document defines one HTTP listener whose router sends requests carrying
// the debug header (exact match) to the service_debug cluster and everything
// else to service_production, followed by the two strict-DNS clusters and the
// admin endpoint.
//
// The body is YAML, so the leading spaces on every line are significant: they
// encode the nesting Envoy expects. Keep the indentation as spaces (never
// tabs) when editing.
const configTemplate = `
static_resources:
  listeners:
  - name: listener_0
    address:
      socket_address:
        address: 0.0.0.0
        port_value: {{ .Config.ListenPort }}
    filter_chains:
    - filters:
      - name: envoy.filters.network.http_connection_manager
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
          stat_prefix: ingress_http
          access_log:
          - name: envoy.access_loggers.stdout
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog
          http_filters:
          - name: envoy.filters.http.router
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
          route_config:
            name: local_route
            virtual_hosts:
            - name: backend
              domains: ["*"]
              routes:
              - match:
                  prefix: "/"
                  headers:
                  - name: "{{ .Config.DebugHeaderName }}"
                    exact_match: "{{ .Config.DebugHeaderValue }}"
                route:
                  cluster: service_debug
              - match:
                  prefix: "/"
                route:
                  cluster: service_production
  clusters:
  - name: service_debug
    connect_timeout: 0.25s
    type: strict_dns
    dns_lookup_family: v4_only
    lb_policy: round_robin
    load_assignment:
      cluster_name: service_debug
      endpoints:
      - lb_endpoints:
        - endpoint:
            address:
              socket_address:
                address: {{ .Config.DebugServerAddress }}
                port_value: {{ .Config.DebugServerPort }}
  - name: service_production
    connect_timeout: 0.25s
    type: strict_dns
    dns_lookup_family: v4_only
    lb_policy: round_robin
    load_assignment:
      cluster_name: service_production
      endpoints:
      - lb_endpoints:
        - endpoint:
            address:
              socket_address:
                address: {{ .Config.ProductionServerAddress }}
                port_value: {{ .Config.ProductionServerPort }}
admin:
  access_log_path: /dev/null
  address:
    socket_address:
      address: 0.0.0.0
      port_value: {{ .AdminPort }}
`
func GenerateEnvoyConfigurationContent(config CreateEnvoyConfig) (string, error) {
t := template.Must(template.New("envoy").Parse(configTemplate))
buf := new(bytes.Buffer)
err := t.Execute(buf, map[string]interface{}{
"Config": config,
"AdminPort": EnvoyAdminPort,
})
if err != nil {
return "", err
}
return strings.TrimSpace(buf.String()), nil
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment