Unverified Commit 648740e8 authored by julienmancuso's avatar julienmancuso Committed by GitHub
Browse files

fix: enable GCP deployments (#1474)

parent 281a69e5
...@@ -69,4 +69,7 @@ const ( ...@@ -69,4 +69,7 @@ const (
KubeAnnotationDynamoComponentStorageNS = "nvidia.com/dynamo-storage-namespace" KubeAnnotationDynamoComponentStorageNS = "nvidia.com/dynamo-storage-namespace"
DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG" DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG"
DockerConfigVolumeName = "docker-config"
DockerConfigVolumeMountPath = "/docker-config/.docker"
) )
...@@ -67,3 +67,10 @@ func getIngressHost(ingressSpec v1alpha1.IngressSpec) string { ...@@ -67,3 +67,10 @@ func getIngressHost(ingressSpec v1alpha1.IngressSpec) string {
func getK8sName(value string) string { func getK8sName(value string) string {
return strings.ReplaceAll(value, ":", "--") return strings.ReplaceAll(value, ":", "--")
} }
// isGoogleRegistry reports whether host is a Google container registry host:
// "gcr.io" itself, or any host ending in ".gcr.io", ".pkg.dev" or
// ".google.com".
//
// Hostnames are compared case-insensitively and a trailing ":port" is
// ignored, so "GCR.IO" and "gcr.io:443" are both recognized. An empty host
// is never a Google registry.
func isGoogleRegistry(host string) bool {
	host = strings.ToLower(host)

	// Drop a trailing ":port". A colon that sits inside a bracketed IPv6
	// literal (e.g. "[::1]") is not a port separator and is left untouched.
	if i := strings.LastIndexByte(host, ':'); i >= 0 && !strings.Contains(host[i:], "]") {
		host = host[:i]
	}

	if host == "gcr.io" {
		return true
	}
	for _, suffix := range []string{".gcr.io", ".pkg.dev", ".google.com"} {
		if strings.HasSuffix(host, suffix) {
			return true
		}
	}
	return false
}
...@@ -852,11 +852,15 @@ func (r *DynamoComponentReconciler) generateImageBuilderPodTemplateSpec(ctx cont ...@@ -852,11 +852,15 @@ func (r *DynamoComponentReconciler) generateImageBuilderPodTemplateSpec(ctx cont
Name: "workspace", Name: "workspace",
MountPath: "/workspace", MountPath: "/workspace",
}, },
{
Name: consts.DockerConfigVolumeName,
MountPath: consts.DockerConfigVolumeMountPath,
},
} }
if dockerConfigJSONSecretName != "" { if dockerConfigJSONSecretName != "" {
volumes = append(volumes, corev1.Volume{ volumes = append(volumes, corev1.Volume{
Name: dockerConfigJSONSecretName, Name: consts.DockerConfigVolumeName,
VolumeSource: corev1.VolumeSource{ VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{ Secret: &corev1.SecretVolumeSource{
SecretName: dockerConfigJSONSecretName, SecretName: dockerConfigJSONSecretName,
...@@ -869,9 +873,12 @@ func (r *DynamoComponentReconciler) generateImageBuilderPodTemplateSpec(ctx cont ...@@ -869,9 +873,12 @@ func (r *DynamoComponentReconciler) generateImageBuilderPodTemplateSpec(ctx cont
}, },
}, },
}) })
volumeMounts = append(volumeMounts, corev1.VolumeMount{ } else {
Name: dockerConfigJSONSecretName, volumes = append(volumes, corev1.Volume{
MountPath: "/kaniko/.docker/", Name: consts.DockerConfigVolumeName,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
}) })
} }
...@@ -921,8 +928,6 @@ func (r *DynamoComponentReconciler) generateImageBuilderPodTemplateSpec(ctx cont ...@@ -921,8 +928,6 @@ func (r *DynamoComponentReconciler) generateImageBuilderPodTemplateSpec(ctx cont
buildEngine := getDynamoComponentImageBuildEngine() buildEngine := getDynamoComponentImageBuildEngine()
privileged := buildEngine != DynamoComponentImageBuildEngineBuildkitRootless
dynamoComponentDownloadCommandTemplate, err := template.New("downloadCommand").Parse(` dynamoComponentDownloadCommandTemplate, err := template.New("downloadCommand").Parse(`
set -e set -e
...@@ -943,10 +948,6 @@ echo "Extracting dynamoComponent tar file..." ...@@ -943,10 +948,6 @@ echo "Extracting dynamoComponent tar file..."
tar -xvf /tmp/downloaded.tar tar -xvf /tmp/downloaded.tar
echo "Removing dynamoComponent tar file..." echo "Removing dynamoComponent tar file..."
rm /tmp/downloaded.tar rm /tmp/downloaded.tar
{{if not .Privileged}}
echo "Changing directory permission..."
chown -R 1000:1000 /workspace
{{end}}
echo "Done" echo "Done"
`) `)
...@@ -961,7 +962,6 @@ echo "Done" ...@@ -961,7 +962,6 @@ echo "Done"
"DynamoComponentDownloadURL": dynamoComponentDownloadURL, "DynamoComponentDownloadURL": dynamoComponentDownloadURL,
"DynamoComponentRepositoryName": dynamoComponentRepositoryName, "DynamoComponentRepositoryName": dynamoComponentRepositoryName,
"DynamoComponentVersion": dynamoComponentVersion, "DynamoComponentVersion": dynamoComponentVersion,
"Privileged": privileged,
}) })
if err != nil { if err != nil {
err = errors.Wrap(err, "failed to execute download command template") err = errors.Wrap(err, "failed to execute download command template")
...@@ -1004,6 +1004,44 @@ echo "Done" ...@@ -1004,6 +1004,44 @@ echo "Done"
}, },
} }
if dockerConfigJSONSecretName == "" {
// if no explicit docker config is provided, we need to provide the docker config to the image builder
var ref name.Reference
ref, err = name.ParseReference(imageName)
if err != nil {
err = errors.Wrap(err, "failed to parse reference")
return
}
dockerRegistry := ref.Context().RegistryStr()
if isGoogleRegistry(dockerRegistry) {
// for GCP, we use the google cloud sdk to get the docker config.
initContainers = append(initContainers, corev1.Container{
Name: "gcp-init-docker-config",
Image: "google/cloud-sdk:slim",
Command: []string{
"/bin/bash",
"-c",
fmt.Sprintf(`set -e
gcloud --quiet config get-value account
TOKEN=$(gcloud --quiet auth print-access-token)
cat > %s/config.json <<EOL
{
"auths": {
"%s": {
"auth": "$(echo -n "oauth2accesstoken:${TOKEN}" | base64 -w 0)"
}
}
}
EOL
echo 'Docker config.json created successfully'`, consts.DockerConfigVolumeMountPath, dockerRegistry),
},
Resources: downloaderContainerResources,
EnvFrom: downloaderContainerEnvFrom,
VolumeMounts: volumeMounts,
})
}
}
containers := make([]corev1.Container, 0) containers := make([]corev1.Container, 0)
var globalExtraPodMetadata *dynamoCommon.ExtraPodMetadata var globalExtraPodMetadata *dynamoCommon.ExtraPodMetadata
...@@ -1111,13 +1149,10 @@ echo "Done" ...@@ -1111,13 +1149,10 @@ echo "Done"
Name: "IFS", Name: "IFS",
Value: "''", Value: "''",
}, },
} {
if dockerConfigJSONSecretName != "" {
builderContainerEnvs = append(builderContainerEnvs, corev1.EnvVar{
Name: "DOCKER_CONFIG", Name: "DOCKER_CONFIG",
Value: "/kaniko/.docker/", Value: consts.DockerConfigVolumeMountPath,
}) },
} }
kanikoCacheRepo := os.Getenv("KANIKO_CACHE_REPO") kanikoCacheRepo := os.Getenv("KANIKO_CACHE_REPO")
...@@ -1174,9 +1209,6 @@ echo "Done" ...@@ -1174,9 +1209,6 @@ echo "Done"
if isBuildkit { if isBuildkit {
output := fmt.Sprintf("type=image,name=%s,push=true,registry.insecure=%v", imageName, dockerRegistryInsecure) output := fmt.Sprintf("type=image,name=%s,push=true,registry.insecure=%v", imageName, dockerRegistryInsecure)
buildkitdFlags := []string{} buildkitdFlags := []string{}
if !privileged {
buildkitdFlags = append(buildkitdFlags, "--oci-worker-no-process-sandbox")
}
if isEstargzEnabled() { if isEstargzEnabled() {
buildkitdFlags = append(buildkitdFlags, "--oci-worker-snapshotter=stargz") buildkitdFlags = append(buildkitdFlags, "--oci-worker-snapshotter=stargz")
output += ",oci-mediatypes=true,compression=estargz,force-compression=true" output += ",oci-mediatypes=true,compression=estargz,force-compression=true"
...@@ -1215,23 +1247,6 @@ echo "Done" ...@@ -1215,23 +1247,6 @@ echo "Done"
} }
} }
var builderContainerSecurityContext *corev1.SecurityContext
if buildEngine == DynamoComponentImageBuildEngineBuildkit {
builderContainerSecurityContext = &corev1.SecurityContext{
Privileged: ptr.To(true),
}
} else if buildEngine == DynamoComponentImageBuildEngineBuildkitRootless {
kubeAnnotations["container.apparmor.security.beta.kubernetes.io/builder"] = "unconfined"
builderContainerSecurityContext = &corev1.SecurityContext{
SeccompProfile: &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeUnconfined,
},
RunAsUser: ptr.To(int64(1000)),
RunAsGroup: ptr.To(int64(1000)),
}
}
// add build args to pass via --build-arg // add build args to pass via --build-arg
for _, buildArg := range buildArgs { for _, buildArg := range buildArgs {
quotedBuildArg := unix.SingleQuote.Quote(buildArg) quotedBuildArg := unix.SingleQuote.Quote(buildArg)
...@@ -1261,7 +1276,13 @@ echo "Done" ...@@ -1261,7 +1276,13 @@ echo "Done"
EnvFrom: builderContainerEnvFrom, EnvFrom: builderContainerEnvFrom,
TTY: true, TTY: true,
Stdin: true, Stdin: true,
SecurityContext: builderContainerSecurityContext, }
if buildEngine == DynamoComponentImageBuildEngineKaniko {
// we need to run as root when using kaniko
container.SecurityContext = &corev1.SecurityContext{
RunAsUser: ptr.To(int64(0)),
}
} }
if globalDefaultImageBuilderContainerResources != nil { if globalDefaultImageBuilderContainerResources != nil {
...@@ -1284,6 +1305,11 @@ echo "Done" ...@@ -1284,6 +1305,11 @@ echo "Done"
Volumes: volumes, Volumes: volumes,
InitContainers: initContainers, InitContainers: initContainers,
Containers: containers, Containers: containers,
SecurityContext: &corev1.PodSecurityContext{
RunAsUser: ptr.To(int64(1000)),
RunAsGroup: ptr.To(int64(1000)),
FSGroup: ptr.To(int64(1000)),
},
}, },
} }
......
...@@ -1000,21 +1000,23 @@ func (r *DynamoComponentDeploymentReconciler) createOrUpdateOrDeleteServices(ctx ...@@ -1000,21 +1000,23 @@ func (r *DynamoComponentDeploymentReconciler) createOrUpdateOrDeleteServices(ctx
return return
} }
func (r *DynamoComponentDeploymentReconciler) createOrUpdateOrDeleteIngress(ctx context.Context, opt generateResourceOption) (modified bool, err error) { func (r *DynamoComponentDeploymentReconciler) createOrUpdateOrDeleteIngress(ctx context.Context, opt generateResourceOption) (bool, error) {
modified, _, err = commonController.SyncResource(ctx, r, opt.dynamoComponentDeployment, func(ctx context.Context) (*networkingv1.Ingress, bool, error) { modified, _, err := commonController.SyncResource(ctx, r, opt.dynamoComponentDeployment, func(ctx context.Context) (*networkingv1.Ingress, bool, error) {
return r.generateIngress(ctx, opt) return r.generateIngress(ctx, opt)
}) })
if err != nil { if err != nil {
return return false, err
} }
modified_, _, err := commonController.SyncResource(ctx, r, opt.dynamoComponentDeployment, func(ctx context.Context) (*networkingv1beta1.VirtualService, bool, error) { if r.UseVirtualService {
return r.generateVirtualService(ctx, opt) modified_, _, err := commonController.SyncResource(ctx, r, opt.dynamoComponentDeployment, func(ctx context.Context) (*networkingv1beta1.VirtualService, bool, error) {
}) return r.generateVirtualService(ctx, opt)
if err != nil { })
return if err != nil {
return false, err
}
return modified || modified_, nil
} }
modified = modified || modified_ return modified, nil
return
} }
func (r *DynamoComponentDeploymentReconciler) generateIngress(ctx context.Context, opt generateResourceOption) (*networkingv1.Ingress, bool, error) { func (r *DynamoComponentDeploymentReconciler) generateIngress(ctx context.Context, opt generateResourceOption) (*networkingv1.Ingress, bool, error) {
......
...@@ -100,6 +100,12 @@ kubectl get storageclass ...@@ -100,6 +100,12 @@ kubectl get storageclass
# standard (default) kubernetes.io/gce-pd Delete Immediate true 1d # standard (default) kubernetes.io/gce-pd Delete Immediate true 1d
``` ```
### Cloud Provider-Specific Deployment
#### Google Kubernetes Engine (GKE) deployment
See the [GKE setup guide](../dynamo_deploy/gke_setup.md) for detailed instructions on deploying to GKE.
### Installation ### Installation
1. Set the required environment variables: 1. Set the required environment variables:
......
# GKE Workload Identity and Artifact Registry Setup Guide
This guide explains how to set up Workload Identity in GKE and configure access to Google Artifact Registry.
## Prerequisites
- Google Cloud SDK installed
- Access to a GKE cluster
- Required permissions to create and manage service accounts
## Project Setup
Set your project:
```bash
export NAMESPACE=your-k8s-namespace
export RELEASE=your-helm-release-name
export PROJECT=$(gcloud config get-value project)
# Set the cluster-related info (you can list your clusters with: gcloud container clusters list)
export CLUSTER_NAME=your-cluster-name
export CLUSTER_REGION=$(gcloud container clusters list --filter="name=${CLUSTER_NAME}" --format="get(location)")
gcloud config set project ${PROJECT}
# Retrieve the Workload Identity pool (namespace) associated with your cluster:
export CLUSTER_WIN=$(gcloud container clusters describe ${CLUSTER_NAME} \
--region=${CLUSTER_REGION} \
--format="value(workloadIdentityConfig.workloadPool)")
```
```{important}
Make sure Workload Identity is enabled in your cluster!
```
## Service Account Creation and Configuration
1. Create a service account for Workload Identity:
Create a new service account with the command below (you can also create it from the GCP console), or reuse an existing one and only set `SA` to its email:
```bash
gcloud iam service-accounts create workload-identity-sa \
--display-name="workload identity service account" \
--description="Service account to use for Workload Identity in GKE"
export SA=workload-identity-sa@${PROJECT}.iam.gserviceaccount.com
```
2. Configure Workload Identity bindings for Kubernetes service accounts:
```bash
gcloud iam service-accounts add-iam-policy-binding \
${SA} \
--role roles/iam.workloadIdentityUser \
--member "serviceAccount:${CLUSTER_WIN}[${NAMESPACE}/${RELEASE}-dynamo-operator-controller-manager]"
gcloud iam service-accounts add-iam-policy-binding \
${SA} \
--role roles/iam.workloadIdentityUser \
--member "serviceAccount:${CLUSTER_WIN}[${NAMESPACE}/${RELEASE}-dynamo-operator-image-builder]"
gcloud iam service-accounts add-iam-policy-binding \
${SA} \
--role roles/iam.workloadIdentityUser \
--member "serviceAccount:${CLUSTER_WIN}[${NAMESPACE}/${RELEASE}-dynamo-operator-component]"
```
## Artifact Registry Access
### Option 1: Project-Level Access
Grant read and write access at the project level:
```bash
# Grant reader role
gcloud projects add-iam-policy-binding ${PROJECT} \
--member="serviceAccount:${SA}" \
--role="roles/artifactregistry.reader"
# Grant writer role
gcloud projects add-iam-policy-binding ${PROJECT} \
--member="serviceAccount:${SA}" \
--role="roles/artifactregistry.writer"
```
### Option 2: Repository-Level Access
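Grant access to a specific repository (repeat the command with `roles/artifactregistry.writer` if the service account also needs to push images):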
```bash
gcloud artifacts repositories add-iam-policy-binding your-artifact-repository \
--location=${CLUSTER_REGION} \
--project=${PROJECT} \
--member="serviceAccount:${SA}" \
--role="roles/artifactregistry.reader"
```
## GKE Node Access to Artifact Registry
This is required so that pods can pull images from Artifact Registry without specifying an `imagePullSecret`.
### For GKE Autopilot
```bash
# Get project number
export PROJECT_NUMBER=$(gcloud projects describe ${PROJECT} --format='value(projectNumber)')
# Grant access to the default compute service account
gcloud projects add-iam-policy-binding ${PROJECT} \
--member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \
--role="roles/artifactregistry.reader"
```
### For Standard GKE
```bash
# Get node service account
export NODE_SERVICE_ACCOUNT=$(gcloud container clusters describe ${CLUSTER_NAME} \
--region ${CLUSTER_REGION} \
--format="get(nodeConfig.serviceAccount)")
# Grant access to node service account
gcloud projects add-iam-policy-binding ${PROJECT} \
--member="serviceAccount:${NODE_SERVICE_ACCOUNT}" \
--role="roles/artifactregistry.reader"
```
## Adding annotations to enable Workload Identity
This is an example of values.yaml used to deploy Dynamo Cloud using custom GCP annotations to enable Workload Identity.
```yaml
dynamo-operator:
...
controllerManager:
serviceAccount:
create: true
annotations:
iam.gke.io/gcp-service-account: your-sa@your-gcp-project.iam.gserviceaccount.com
...
dynamo:
dockerRegistry:
useKubernetesSecret: false
server: us-central1-docker.pkg.dev/your-project/your-registry
components:
serviceAccount:
annotations:
iam.gke.io/gcp-service-account: your-sa@your-gcp-project.iam.gserviceaccount.com
imageBuilder:
serviceAccount:
annotations:
iam.gke.io/gcp-service-account: your-sa@your-gcp-project.iam.gserviceaccount.com
...
....
```
You can pass this file to Helm during installation (the last step of `deploy.sh`):
```bash
helm upgrade --install ${RELEASE} platform/ -f values.yaml --namespace ${NAMESPACE}
```
## Important Notes
1. **Prerequisites for Image Pulling**:
- Workload Identity must be enabled on your GKE cluster
- GKE nodes' service account must have the `artifactregistry.reader` role
2. **Troubleshooting**:
- If pods can't pull images, verify both Workload Identity and node service account configurations
- Check service account annotations on Kubernetes service accounts
- Verify IAM bindings are correctly set up
## References
- [GKE Workload Identity Documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity)
- [Artifact Registry Authentication](https://cloud.google.com/artifact-registry/docs/docker/authentication)
- [IAM Roles for Artifact Registry](https://cloud.google.com/artifact-registry/docs/access-control)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment