Unverified commit 436f08df, authored by hhzhang16, committed by GitHub
Browse files

feat: add fsgroup to ProfilingJob (#4298)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 4891e161
...@@ -36,6 +36,7 @@ import ( ...@@ -36,6 +36,7 @@ import (
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/yaml" "k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/client-go/tools/record" "k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime" ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client"
...@@ -1140,9 +1141,14 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1140,9 +1141,14 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
Template: corev1.PodTemplateSpec{ Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{ Spec: corev1.PodSpec{
ServiceAccountName: ServiceAccountProfilingJob, ServiceAccountName: ServiceAccountProfilingJob,
RestartPolicy: corev1.RestartPolicyNever, RestartPolicy: corev1.RestartPolicyNever, SecurityContext: &corev1.PodSecurityContext{
Containers: []corev1.Container{profilerContainer, sidecarContainer}, RunAsNonRoot: ptr.To(true), // Enforces that container cannot run as root
Volumes: volumes, RunAsUser: ptr.To[int64](1000), // Run as UID 1000 (non-privileged user)
RunAsGroup: ptr.To[int64](1000), // Run with GID 1000 (non-privileged group)
FSGroup: ptr.To[int64](1000), // Volume files owned by GID 1000
},
Containers: []corev1.Container{profilerContainer, sidecarContainer},
Volumes: volumes,
ImagePullSecrets: []corev1.LocalObjectReference{ ImagePullSecrets: []corev1.LocalObjectReference{
{Name: "nvcr-imagepullsecret"}, {Name: "nvcr-imagepullsecret"},
}, },
......
...@@ -1092,6 +1092,75 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1092,6 +1092,75 @@ var _ = Describe("DGDR Profiler Arguments", func() {
// Clean up // Clean up
_ = k8sClient.Delete(ctx, job) _ = k8sClient.Delete(ctx, job)
}) })
// Verifies that the Job produced by createProfilingJob carries a pod-level
// security context with runAsNonRoot=true and runAsUser, runAsGroup and
// fsGroup all set to 1000.
It("Should set fsGroup in pod security context for volume permissions", func() {
	ctx := context.Background()
	namespace := "default"
	dgdrName := "test-fsgroup"

	// Create the ServiceAccount that the profiling job's pod spec names.
	sa := &corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{
			Name:      ServiceAccountProfilingJob,
			Namespace: namespace,
		},
	}
	Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
	// Best-effort cleanup: the delete error is deliberately ignored.
	defer func() { _ = k8sClient.Delete(ctx, sa) }()

	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{
			Name:      dgdrName,
			Namespace: namespace,
		},
		Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
			Model:   "test-model",
			Backend: "trtllm",
			ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
				ProfilerImage: "test-profiler:latest",
				Config: createTestConfig(map[string]interface{}{
					"sla": map[string]interface{}{
						"ttft": 50.0,
						"itl":  10.0,
						"isl":  3000,
						"osl":  500,
					},
				}),
			},
		},
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	// Best-effort cleanup: the delete error is deliberately ignored.
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// Re-fetch DGDR to get proper metadata from API server
	var fetchedDGDR nvidiacomv1alpha1.DynamoGraphDeploymentRequest
	Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed())

	// Create profiling job with properly initialized DGDR
	err := reconciler.createProfilingJob(ctx, &fetchedDGDR)
	Expect(err).NotTo(HaveOccurred())

	// Verify job was created
	jobName := getProfilingJobName(&fetchedDGDR)
	job := &batchv1.Job{}
	Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed())
	// Deferred, like the ServiceAccount and DGDR cleanup above, so the Job is
	// removed even when one of the assertions below fails. Defers run in
	// reverse order, so the Job is still deleted before the DGDR and the
	// ServiceAccount.
	defer func() { _ = k8sClient.Delete(ctx, job) }()

	// Verify security context has all security fields set correctly.
	// Each pointer field is checked for nil before it is dereferenced so a
	// missing field fails the assertion instead of panicking.
	podSecurityContext := job.Spec.Template.Spec.SecurityContext
	Expect(podSecurityContext).NotTo(BeNil())

	Expect(podSecurityContext.RunAsNonRoot).NotTo(BeNil())
	Expect(*podSecurityContext.RunAsNonRoot).To(BeTrue())

	Expect(podSecurityContext.RunAsUser).NotTo(BeNil())
	Expect(*podSecurityContext.RunAsUser).To(Equal(int64(1000)))

	Expect(podSecurityContext.RunAsGroup).NotTo(BeNil())
	Expect(*podSecurityContext.RunAsGroup).To(Equal(int64(1000)))

	Expect(podSecurityContext.FSGroup).NotTo(BeNil())
	Expect(*podSecurityContext.FSGroup).To(Equal(int64(1000)))
})
}) })
}) })
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment