api-service.yaml 3.32 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

---
# Identity for the fault-injection API. The Deployment in this file runs
# under it (serviceAccountName) and the ClusterRoleBinding of the same
# name lists it as its subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: fault-injection-system
  name: fault-injection-api

---
# Cluster-scoped permissions for the fault-injection API.
# One rule per API group; verbs are listed explicitly (no wildcards).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: fault-injection-api
rules:
# Core API group (""): read, watch and patch nodes, pods and services.
- apiGroups:
  - ""
  resources:
  - nodes
  - pods
  - services
  verbs:
  - get
  - list
  - watch
  - patch
# apps group: read-only access to workload controllers.
- apiGroups:
  - apps
  resources:
  - deployments
  - daemonsets
  - statefulsets
  verbs:
  - get
  - list
  - watch
# networking.k8s.io: read, create and delete NetworkPolicies
# (no watch, update or patch).
- apiGroups:
  - networking.k8s.io
  resources:
  - networkpolicies
  verbs:
  - get
  - list
  - create
  - delete
# chaos-mesh.org custom resources: read, watch, create and delete.
- apiGroups:
  - chaos-mesh.org
  resources:
  - networkchaos
  - podchaos
  - stresschaos
  - iochaos
  verbs:
  - get
  - list
  - create
  - delete
  - watch

---
# Grants the fault-injection-api ClusterRole, cluster-wide, to the
# fault-injection-api ServiceAccount in fault-injection-system.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: fault-injection-api
# Who receives the permissions.
subjects:
- kind: ServiceAccount
  namespace: fault-injection-system
  name: fault-injection-api
# Which role is granted.
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: fault-injection-api

---
# Single-replica Deployment of the fault-injection API. The pod runs on the
# host network of a GPU node and uses the fault-injection-api ServiceAccount.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: fault-injection-api
  namespace: fault-injection-system
  labels:
    app: fault-injection-api
spec:
  replicas: 1
  # With hostNetwork the declared container port (8080) is bound on the
  # node, so a replacement pod cannot be scheduled next to the pod it
  # replaces. The default RollingUpdate for one replica (maxSurge 1,
  # maxUnavailable 0) would leave the new pod Pending whenever no other
  # eligible node is free. Take the old pod down first instead; this
  # means a brief gap in availability during each rollout.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
  selector:
    matchLabels:
      app: fault-injection-api
  template:
    metadata:
      labels:
        # Must match spec.selector and the Service selector.
        app: fault-injection-api
    spec:
      serviceAccountName: fault-injection-api
      # Use host network to communicate with hostNetwork agents
      hostNetwork: true
      # Keep cluster DNS resolution while on the host network.
      dnsPolicy: ClusterFirstWithHostNet
      # Tolerate GPU node taints
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            # Expressions within one term are ANDed: a node must satisfy
            # both of them to be eligible.
            - matchExpressions:
              # Require GPU nodes (A100 pools)
              - key: nvidia.com/gpu.present
                operator: In
                values:
                - "true"
              # Require this instance type as well. This is a hard rule,
              # not a preference.
              # NOTE(review): if only a preference was intended, move it to
              # preferredDuringSchedulingIgnoredDuringExecution.
              - key: node.kubernetes.io/instance-type
                operator: In
                values:
                - Standard_ND96amsr_A100_v4
      containers:
      - name: api
        # Replace with your Azure Container Registry (ACR)
        # NOTE(review): `latest` is a mutable tag; consider pinning a
        # version or digest so rollouts are reproducible.
        image: dynamoci.azurecr.io/fault-injection-api:latest
        imagePullPolicy: Always
        ports:
        - name: http
          containerPort: 8080
          protocol: TCP
        env:
        # Any non-empty value makes Python write stdout/stderr unbuffered.
        - name: PYTHONUNBUFFERED
          value: "1"
        # Both probes poll the same endpoint; readiness starts earlier and
        # polls more often than liveness.
        # NOTE(review): assumes /health is cheap and dependency-free;
        # confirm.
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 30
        readinessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 5
          periodSeconds: 10
        resources:
          requests:
            memory: "256Mi"
            cpu: "100m"
          limits:
            memory: "1Gi"
            cpu: "500m"

---
# In-cluster (ClusterIP) endpoint for the fault-injection API.
apiVersion: v1
kind: Service
metadata:
  namespace: fault-injection-system
  name: fault-injection-api
  labels:
    app: fault-injection-api
spec:
  type: ClusterIP
  # Routes to pods carrying the label set by the Deployment's pod template.
  selector:
    app: fault-injection-api
  ports:
  # Service port 8080 forwards to container port 8080 over TCP.
  - name: http
    protocol: TCP
    port: 8080
    targetPort: 8080