values.yaml 5.23 KB
Newer Older
raojy's avatar
raojy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# -- Default values for chart vllm
# -- Declare variables to be passed into your templates.

# -- Image configuration
image:
  # -- Image repository
  repository: "vllm/vllm-openai"
  # -- Image tag
  tag: "latest"
  # -- Container launch command
  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]

# -- Container port
containerPort: 8000
# -- Service name
serviceName:
# -- Service port
servicePort: 80
# -- Additional ports configuration
extraPorts: []

# -- Number of replicas
replicaCount: 1

# -- Deployment strategy configuration
deploymentStrategy: {}

# -- Resource configuration
resources:
  requests:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of gpus used
    nvidia.com/gpu: 1
  limits:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of gpus used
    nvidia.com/gpu: 1

# -- Type of gpu used
gpuModels:
  - "TYPE_GPU_USED"

# -- Autoscaling configuration
autoscaling:
  # -- Enable autoscaling
  enabled: false
  # -- Minimum replicas
  minReplicas: 1
  # -- Maximum replicas
  maxReplicas: 100
  # -- Target CPU utilization for autoscaling
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80

# -- Configmap
configs: {}

# -- Secrets configuration
secrets: {}

# -- External configuration
externalConfigs: []

# -- Custom Objects configuration
customObjects: []

# -- Disruption Budget Configuration
maxUnavailablePodDisruptionBudget: ""

# -- Additional configuration for the init container
extraInit:
  # -- Model download functionality (optional)
  modelDownload:
    # -- Enable model download job and wait container
    enabled: true
    # -- Image configuration for model download operations
    image:
      # -- Image repository
      repository: "amazon/aws-cli"
      # -- Image tag
      tag: "2.6.4"
      # -- Image pull policy
      pullPolicy: "IfNotPresent"
    # -- Wait container configuration (init container that waits for model to be ready)
    waitContainer:
      # -- Command to execute
      command: ["/bin/bash"]
      # -- Arguments for the wait container
      args:
        - "-eucx"
        - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
      # env:
      #   - name: HUGGING_FACE_HUB_TOKEN
      #     value: "your-token"
      #   - name: MODEL_ID
      #     value: "meta-llama/Llama-2-7b"
    # -- Download job configuration (job that actually downloads the model)
    downloadJob:
      # -- Command to execute
      command: ["/bin/bash"]
      # -- Arguments for the download job
      args:
        - "-eucx"
        - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
      # env:
      #   - name: HUGGING_FACE_HUB_TOKEN
      #     value: "your-token"
      #   - name: MODEL_ID
      #     value: "meta-llama/Llama-2-7b"

  # -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
  initContainers: []
  # Example for llm-d sidecar:
  # initContainers:
  #   - name: llm-d-routing-proxy
  #     image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
  #     imagePullPolicy: IfNotPresent
  #     ports:
  #       - containerPort: 8080
  #         name: proxy
  #     securityContext:
  #       runAsUser: 1000

  # -- Path of the model on the s3 which hosts model weights and config files
  s3modelpath: "relative_s3_model_path/opt-125m"
  # -- Storage size for the PVC
  pvcStorage: "1Gi"
  # -- Disable AWS EC2 metadata service
  awsEc2MetadataDisabled: true

# -- Additional containers configuration
extraContainers: []

# -- Readiness probe configuration
readinessProbe:
  # -- Number of seconds after the container has started before readiness probe is initiated
  initialDelaySeconds: 5
  # -- How often (in seconds) to perform the readiness probe
  periodSeconds: 5
  # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
  failureThreshold: 3
   # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000

# -- Liveness probe configuration
livenessProbe:
 # -- Number of seconds after the container has started before liveness probe is initiated
  initialDelaySeconds: 15
  # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
  failureThreshold: 3
  # -- How often (in seconds) to perform the liveness probe
  periodSeconds: 10
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000

labels:
  environment: "test"
  release: "test"