Unverified commit 9200ac9b, authored by alexanderbilk and committed by GitHub
Browse files

feat: Added NIXL Telemetry prometheus port (#5567)


Signed-off-by: Aleksandr Bilkovskii <alexanderb@nvidia.com>
parent 4ede59a2
...@@ -57,6 +57,9 @@ spec: ...@@ -57,6 +57,9 @@ spec:
- interval: 5s - interval: 5s
path: /metrics path: /metrics
port: system port: system
- interval: 5s
path: /metrics
port: nixl
selector: selector:
matchLabels: matchLabels:
nvidia.com/dynamo-component-type: worker nvidia.com/dynamo-component-type: worker
......
...@@ -185,6 +185,9 @@ The operator automatically injects environment variables based on component type ...@@ -185,6 +185,9 @@ The operator automatically injects environment variables based on component type
- **`DYN_SYSTEM_PORT`**: `9090` (automatically enables the system metrics server) - **`DYN_SYSTEM_PORT`**: `9090` (automatically enables the system metrics server)
- **`DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS`**: `["generate"]` - **`DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS`**: `["generate"]`
- **`DYN_SYSTEM_ENABLED`**: `true` (needed for runtime images 0.6.1 and older) - **`DYN_SYSTEM_ENABLED`**: `true` (needed for runtime images 0.6.1 and older)
- **`NIXL_TELEMETRY_PROMETHEUS_PORT`**: `19090`
- **`NIXL_TELEMETRY_EXPORTER`**: `prometheus`
- **`NIXL_TELEMETRY_ENABLE`**: `n` (by default NIXL telemetry is disabled)
### Planner Components ### Planner Components
......
...@@ -24,6 +24,9 @@ const ( ...@@ -24,6 +24,9 @@ const (
EPPGRPCPort = 9002 EPPGRPCPort = 9002
EPPGRPCPortName = "grpc" EPPGRPCPortName = "grpc"
DynamoNixlPort = 19090
DynamoNixlPortName = "nixl"
MpiRunSshPort = 2222 MpiRunSshPort = 2222
// Default security context values // Default security context values
......
...@@ -771,6 +771,9 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -771,6 +771,9 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{Name: "DYN_SYSTEM_ENABLED", Value: "true"}, {Name: "DYN_SYSTEM_ENABLED", Value: "true"},
{Name: "DYN_SYSTEM_PORT", Value: "9090"}, {Name: "DYN_SYSTEM_PORT", Value: "9090"},
{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "[\"generate\"]"}, {Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "[\"generate\"]"},
{Name: "NIXL_TELEMETRY_ENABLE", Value: "n"},
{Name: "NIXL_TELEMETRY_EXPORTER", Value: "prometheus"},
{Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090"},
{Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{ {Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{ FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.name", FieldPath: "metadata.name",
...@@ -793,6 +796,9 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -793,6 +796,9 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{ {
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoSystemPortName, ContainerPort: commonconsts.DynamoSystemPort, Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoSystemPortName, ContainerPort: commonconsts.DynamoSystemPort,
}, },
{
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoNixlPortName, ContainerPort: commonconsts.DynamoNixlPort,
},
}, },
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{ {
...@@ -906,6 +912,9 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -906,6 +912,9 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{Name: "DYN_SYSTEM_ENABLED", Value: "true"}, {Name: "DYN_SYSTEM_ENABLED", Value: "true"},
{Name: "DYN_SYSTEM_PORT", Value: "9090"}, {Name: "DYN_SYSTEM_PORT", Value: "9090"},
{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "[\"generate\"]"}, {Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "[\"generate\"]"},
{Name: "NIXL_TELEMETRY_ENABLE", Value: "n"},
{Name: "NIXL_TELEMETRY_EXPORTER", Value: "prometheus"},
{Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090"},
{Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{ {Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{ FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.name", FieldPath: "metadata.name",
...@@ -928,6 +937,9 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -928,6 +937,9 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{ {
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoSystemPortName, ContainerPort: commonconsts.DynamoSystemPort, Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoSystemPortName, ContainerPort: commonconsts.DynamoSystemPort,
}, },
{
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoNixlPortName, ContainerPort: commonconsts.DynamoNixlPort,
},
}, },
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{ {
......
...@@ -32,6 +32,11 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont ...@@ -32,6 +32,11 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
Name: commonconsts.DynamoSystemPortName, Name: commonconsts.DynamoSystemPortName,
ContainerPort: int32(commonconsts.DynamoSystemPort), ContainerPort: int32(commonconsts.DynamoSystemPort),
}, },
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoNixlPortName,
ContainerPort: int32(commonconsts.DynamoNixlPort),
},
} }
container.LivenessProbe = &corev1.Probe{ container.LivenessProbe = &corev1.Probe{
...@@ -90,6 +95,18 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont ...@@ -90,6 +95,18 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
Name: "DYN_HEALTH_CHECK_ENABLED", Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "false", Value: "false",
}, },
{
Name: "NIXL_TELEMETRY_ENABLE",
Value: "n",
},
{
Name: "NIXL_TELEMETRY_EXPORTER",
Value: "prometheus",
},
{
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoNixlPort),
},
}...) }...)
return container, nil return container, nil
......
...@@ -2066,6 +2066,11 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2066,6 +2066,11 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoSystemPortName, Name: commonconsts.DynamoSystemPortName,
ContainerPort: int32(commonconsts.DynamoSystemPort), ContainerPort: int32(commonconsts.DynamoSystemPort),
}, },
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoNixlPortName,
ContainerPort: int32(commonconsts.DynamoNixlPort),
},
}, },
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{ {
...@@ -2112,6 +2117,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2112,6 +2117,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "DYN_HEALTH_CHECK_ENABLED", Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "false", Value: "false",
}, },
{
Name: "NIXL_TELEMETRY_ENABLE",
Value: "n",
},
{
Name: "NIXL_TELEMETRY_EXPORTER",
Value: "prometheus",
},
{
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: "19090",
},
{ {
Name: "DYN_PARENT_DGD_K8S_NAME", Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment", Value: "test-dynamo-graph-deployment",
...@@ -2259,6 +2276,11 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2259,6 +2276,11 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoSystemPortName, Name: commonconsts.DynamoSystemPortName,
ContainerPort: int32(commonconsts.DynamoSystemPort), ContainerPort: int32(commonconsts.DynamoSystemPort),
}, },
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoNixlPortName,
ContainerPort: int32(commonconsts.DynamoNixlPort),
},
}, },
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{ {
...@@ -2305,6 +2327,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2305,6 +2327,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "DYN_HEALTH_CHECK_ENABLED", Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "false", Value: "false",
}, },
{
Name: "NIXL_TELEMETRY_ENABLE",
Value: "n",
},
{
Name: "NIXL_TELEMETRY_EXPORTER",
Value: "prometheus",
},
{
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: "19090",
},
{ {
Name: "DYN_PARENT_DGD_K8S_NAME", Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment", Value: "test-dynamo-graph-deployment",
...@@ -3023,6 +3057,11 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3023,6 +3057,11 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoSystemPortName, Name: commonconsts.DynamoSystemPortName,
ContainerPort: int32(commonconsts.DynamoSystemPort), ContainerPort: int32(commonconsts.DynamoSystemPort),
}, },
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoNixlPortName,
ContainerPort: int32(commonconsts.DynamoNixlPort),
},
}, },
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{ {
...@@ -3069,6 +3108,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3069,6 +3108,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "DYN_HEALTH_CHECK_ENABLED", Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "false", Value: "false",
}, },
{
Name: "NIXL_TELEMETRY_ENABLE",
Value: "n",
},
{
Name: "NIXL_TELEMETRY_EXPORTER",
Value: "prometheus",
},
{
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: "19090",
},
{ {
Name: "DYN_PARENT_DGD_K8S_NAME", Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment", Value: "test-dynamo-graph-deployment",
...@@ -3203,6 +3254,11 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3203,6 +3254,11 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoSystemPortName, Name: commonconsts.DynamoSystemPortName,
ContainerPort: int32(commonconsts.DynamoSystemPort), ContainerPort: int32(commonconsts.DynamoSystemPort),
}, },
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoNixlPortName,
ContainerPort: int32(commonconsts.DynamoNixlPort),
},
}, },
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{ {
...@@ -3249,6 +3305,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3249,6 +3305,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "DYN_HEALTH_CHECK_ENABLED", Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "false", Value: "false",
}, },
{
Name: "NIXL_TELEMETRY_ENABLE",
Value: "n",
},
{
Name: "NIXL_TELEMETRY_EXPORTER",
Value: "prometheus",
},
{
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: "19090",
},
{ {
Name: "DYN_PARENT_DGD_K8S_NAME", Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment", Value: "test-dynamo-graph-deployment",
...@@ -5269,6 +5337,9 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { ...@@ -5269,6 +5337,9 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
{Name: "DYN_SYSTEM_ENABLED", Value: "true"}, {Name: "DYN_SYSTEM_ENABLED", Value: "true"},
{Name: "DYN_SYSTEM_PORT", Value: "9090"}, {Name: "DYN_SYSTEM_PORT", Value: "9090"},
{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "[\"generate\"]"}, {Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "[\"generate\"]"},
{Name: "NIXL_TELEMETRY_ENABLE", Value: "n"},
{Name: "NIXL_TELEMETRY_EXPORTER", Value: "prometheus"},
{Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090"},
{Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{ {Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{ FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.name", FieldPath: "metadata.name",
...@@ -5330,6 +5401,11 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { ...@@ -5330,6 +5401,11 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
ContainerPort: int32(commonconsts.DynamoSystemPort), ContainerPort: int32(commonconsts.DynamoSystemPort),
Protocol: corev1.ProtocolTCP, Protocol: corev1.ProtocolTCP,
}, },
{
Name: commonconsts.DynamoNixlPortName,
ContainerPort: int32(commonconsts.DynamoNixlPort),
Protocol: corev1.ProtocolTCP,
},
}, },
}, },
}, },
......
...@@ -1306,6 +1306,9 @@ The operator automatically injects environment variables based on component type ...@@ -1306,6 +1306,9 @@ The operator automatically injects environment variables based on component type
- **`DYN_SYSTEM_PORT`**: `9090` (automatically enables the system metrics server) - **`DYN_SYSTEM_PORT`**: `9090` (automatically enables the system metrics server)
- **`DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS`**: `["generate"]` - **`DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS`**: `["generate"]`
- **`DYN_SYSTEM_ENABLED`**: `true` (needed for runtime images 0.6.1 and older) - **`DYN_SYSTEM_ENABLED`**: `true` (needed for runtime images 0.6.1 and older)
- **`NIXL_TELEMETRY_PROMETHEUS_PORT`**: `19090`
- **`NIXL_TELEMETRY_EXPORTER`**: `prometheus`
- **`NIXL_TELEMETRY_ENABLE`**: `n` (by default NIXL telemetry is disabled)
### Planner Components ### Planner Components
......
...@@ -110,6 +110,19 @@ For more information about validating the deployment, see the [vLLM README](../. ...@@ -110,6 +110,19 @@ For more information about validating the deployment, see the [vLLM README](../.
## Set Up Metrics Collection ## Set Up Metrics Collection
### Enable NIXL Telemetry (Optional)
To enable NIXL telemetry metrics in addition to Dynamo metrics, set the following environment variables in your worker component:
```yaml
spec:
  services:
    YourWorker:
      envs:
        - name: NIXL_TELEMETRY_ENABLE
          value: "y"
```
NIXL telemetry is disabled by default. When enabled, NIXL metrics will be exposed on the port specified by `NIXL_TELEMETRY_PROMETHEUS_PORT` (19090 by default).
### Create PodMonitors ### Create PodMonitors
The Prometheus Operator uses PodMonitor resources to automatically discover and scrape metrics from pods. To enable this discovery, the Dynamo operator automatically creates PodMonitor resource and adds these labels to all pods: The Prometheus Operator uses PodMonitor resources to automatically discover and scrape metrics from pods. To enable this discovery, the Dynamo operator automatically creates PodMonitor resource and adds these labels to all pods:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment