Unverified Commit 09b6ab2f authored by MatejKosec's avatar MatejKosec Committed by GitHub
Browse files

fix(operator): shell-quote mpirun args + add nodeSelector to DGDR profilingConfig (#6248)


Signed-off-by: default avatarMatej Kosec <mkosec@nvidia.com>
parent 6ef20625
......@@ -184,6 +184,13 @@ spec:
required:
- name
type: object
nodeSelector:
additionalProperties:
type: string
description: |-
NodeSelector is a selector which must match a node's labels for the profiling pod to be scheduled on that node.
For example, to schedule on ARM64 nodes, use {"kubernetes.io/arch": "arm64"}.
type: object
outputPVC:
description: |-
OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.
......
......@@ -89,6 +89,11 @@ type ProfilingConfigSpec struct {
// For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint.
// +kubebuilder:validation:Optional
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
// NodeSelector is a selector which must match a node's labels for the profiling pod to be scheduled on that node.
// For example, to schedule on ARM64 nodes, use {"kubernetes.io/arch": "arm64"}.
// +kubebuilder:validation:Optional
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
}
// DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments.
......
......@@ -1355,6 +1355,13 @@ func (in *ProfilingConfigSpec) DeepCopyInto(out *ProfilingConfigSpec) {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.NodeSelector != nil {
in, out := &in.NodeSelector, &out.NodeSelector
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProfilingConfigSpec.
......
......@@ -184,6 +184,13 @@ spec:
required:
- name
type: object
nodeSelector:
additionalProperties:
type: string
description: |-
NodeSelector is a selector which must match a node's labels for the profiling pod to be scheduled on that node.
For example, to schedule on ARM64 nodes, use {"kubernetes.io/arch": "arm64"}.
type: object
outputPVC:
description: |-
OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.
......
......@@ -1212,6 +1212,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
podSpec.Tolerations = dgdr.Spec.ProfilingConfig.Tolerations
}
// Apply nodeSelector if specified in the DGDR
if len(dgdr.Spec.ProfilingConfig.NodeSelector) > 0 {
podSpec.NodeSelector = dgdr.Spec.ProfilingConfig.NodeSelector
}
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: jobName,
......
......@@ -17,6 +17,24 @@ type TRTLLMBackend struct {
MpiRunSecretName string
}
// shellQuoteForBashC quotes a string for safe use inside a single-quoted bash -c '...' command.
// Since the outer context is already single-quoted, we CANNOT use single quotes here.
// Instead, we use double quotes and escape characters that are special inside double quotes:
// backslash, double quote, dollar sign, and backtick.
// Any embedded single quotes are also escaped (\') since the result lives inside bash -c '...'.
func shellQuoteForBashC(s string) string {
if strings.ContainsAny(s, " \t\n'\"\\{}[]$`!") {
escaped := s
escaped = strings.ReplaceAll(escaped, `\`, `\\`) // must be first
escaped = strings.ReplaceAll(escaped, `"`, `\"`)
escaped = strings.ReplaceAll(escaped, `$`, `\$`)
escaped = strings.ReplaceAll(escaped, "`", "\\`")
escaped = strings.ReplaceAll(escaped, "'", `'"'"'`) // end single-quote, literal ', restart single-quote
return `"` + escaped + `"`
}
return s
}
func (b *TRTLLMBackend) UpdateContainer(container *corev1.Container, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentSharedSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
// Check for volumeMounts with useAsCompilationCache=true
for _, volumeMount := range component.VolumeMounts {
......@@ -111,12 +129,15 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
if len(container.Command) > 0 && isPythonCommand(container.Command[0]) {
// Direct Python command: combine command + args
var parts []string
parts = append(parts, container.Command...)
if len(container.Args) > 0 {
parts = append(parts, container.Args...)
// Shell-quote each part to handle args with spaces (e.g., JSON in --override-engine-args)
var quotedParts []string
for _, part := range container.Command {
quotedParts = append(quotedParts, shellQuoteForBashC(part))
}
for _, part := range container.Args {
quotedParts = append(quotedParts, shellQuoteForBashC(part))
}
originalCommand = strings.Join(parts, " ")
originalCommand = strings.Join(quotedParts, " ")
} else if len(container.Args) > 0 {
// Shell command (sh -c): args contains the full command
originalCommand = strings.Join(container.Args, " ")
......
......@@ -2,6 +2,7 @@ package dynamo
import (
"reflect"
"strings"
"testing"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
......@@ -14,6 +15,105 @@ const (
mpiRunSecretName = "mpi-run-ssh-secret"
)
func TestShellQuoteForBashC(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
{
name: "Simple flag - no quoting needed",
input: "--model-path",
expected: "--model-path",
},
{
name: "Simple path - no quoting needed",
input: "/usr/bin/python3",
expected: "/usr/bin/python3",
},
{
name: "Empty string - no quoting needed",
input: "",
expected: "",
},
{
name: "JSON override-engine-args (primary use case)",
input: `{"load_format": "DUMMY", "tensor_parallel_size": 8}`,
expected: `"{\"load_format\": \"DUMMY\", \"tensor_parallel_size\": 8}"`,
},
{
name: "String with dollar sign is escaped",
input: "$HOME/path",
expected: `"\$HOME/path"`,
},
{
name: "String with backtick is escaped",
input: "hello `world`",
expected: "\"hello \\`world\\`\"",
},
{
name: "Backslash is escaped first to avoid double-escaping",
input: `path\to\file`,
expected: `"path\\to\\file"`,
},
{
name: "Single quote breaks out of outer bash -c context",
input: "it's",
expected: `"it'"'"'s"`,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := shellQuoteForBashC(tt.input)
if result != tt.expected {
t.Errorf("shellQuoteForBashC(%q) = %q, want %q", tt.input, result, tt.expected)
}
})
}
}
func TestShellQuoteForBashC_InLeaderCommand(t *testing.T) {
// End-to-end: verify that when setupLeaderContainer receives args with JSON
// (the actual --override-engine-args use case), the final command string
// contains properly quoted JSON that will survive bash -c shell parsing.
backend := &TRTLLMBackend{}
jsonArgs := `{"load_format": "DUMMY", "tensor_parallel_size": 8}`
container := &corev1.Container{
Command: []string{"python3", "-m", "dynamo.trtllm"},
Args: []string{"--model-path", "deepseek-ai/DeepSeek-R1", "--override-engine-args", jsonArgs},
}
component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
Resources: &v1alpha1.Resources{
Requests: &v1alpha1.ResourceItem{GPU: "4"},
},
}
backend.setupLeaderContainer(container, 2, "dec", component, &GroveMultinodeDeployer{})
if len(container.Args) != 1 {
t.Fatalf("expected 1 arg, got %d", len(container.Args))
}
cmd := container.Args[0]
// The JSON arg must be double-quoted with escaped inner double quotes
// so it survives the outer bash -c '...' wrapper intact.
expectedQuotedJSON := `"{\"load_format\": \"DUMMY\", \"tensor_parallel_size\": 8}"`
if !strings.Contains(cmd, expectedQuotedJSON) {
t.Errorf("generated command does not contain properly quoted JSON.\nGot: %s\nWant substring: %s", cmd, expectedQuotedJSON)
}
// The --override-engine-args flag itself should appear unquoted (no special chars)
if !strings.Contains(cmd, "--override-engine-args "+expectedQuotedJSON) {
t.Errorf("--override-engine-args and its JSON value should be adjacent in the command.\nGot: %s", cmd)
}
// Verify the command is wrapped in bash -c with trtllm-llmapi-launch
if !strings.Contains(cmd, "bash -c 'trtllm-llmapi-launch") {
t.Errorf("command should contain bash -c 'trtllm-llmapi-launch wrapper.\nGot: %s", cmd)
}
}
func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
tests := []struct {
name string
......
......@@ -873,6 +873,7 @@ _Appears in:_
| `outputPVC` _string_ | OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.<br />If specified, all profiling artifacts (logs, plots, configs, raw data) will be written<br />to this PVC instead of an ephemeral emptyDir volume. This allows users to access<br />complete profiling results after the job completes by mounting the PVC.<br />The PVC must exist in the same namespace as the DGDR.<br />If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.<br />Note: ConfigMaps are still created regardless of this setting for planner integration. | | Optional: \{\} <br /> |
| `resources` _[ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcerequirements-v1-core)_ | Resources specifies the compute resource requirements for the profiling job container.<br />If not specified, no resource requests or limits are set. | | Optional: \{\} <br /> |
| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations allows the profiling job to be scheduled on nodes with matching taints.<br />For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint. | | Optional: \{\} <br /> |
| `nodeSelector` _object (keys:string, values:string)_ | NodeSelector is a selector which must match a node's labels for the profiling pod to be scheduled on that node.<br />For example, to schedule on ARM64 nodes, use \{"kubernetes.io/arch": "arm64"\}. | | Optional: \{\} <br /> |
#### ResourceItem
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment