$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
# Managed online deployment `bloom-deployment`, attached to the
# `bloom-inference` endpoint. It serves the registered model
# `bloom-safetensors` (version 1) from a custom container image.
name: bloom-deployment
endpoint_name: bloom-inference

# Registered model and the path at which it is mounted in the container.
model: azureml:bloom-safetensors:1
model_mount_path: /var/azureml-model

# Environment variables passed to the serving container.
environment_variables:
  # Points below `model_mount_path`, at a directory named after the model.
  WEIGHTS_CACHE_OVERRIDE: /var/azureml-model/bloom-safetensors
  MODEL_ID: bigscience/bloom
  # Quoted so the value stays a string instead of being typed as an integer.
  # NOTE(review): presumably matches the GPU count of `instance_type` below;
  # confirm before changing either value.
  NUM_SHARD: "8"

# Custom container image and the HTTP routes it exposes on port 80.
environment:
  image: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference:0.2.0
  inference_config:
    # Liveness and readiness checks share the same `/health` route.
    liveness_route:
      port: 80
      path: /health
    readiness_route:
      port: 80
      path: /health
    # Route that receives scoring (inference) requests.
    scoring_route:
      port: 80
      path: /generate

instance_type: Standard_ND96amsr_A100_v4

request_settings:
  # 90000 ms = 90 s.
  request_timeout_ms: 90000
  max_concurrent_requests_per_instance: 256

# Both probes use identical settings.
# NOTE(review): probe durations are assumed to be in seconds (verify against
# the deployment schema); the long initial delay presumably leaves time for
# the model weights to load.
liveness_probe:
  initial_delay: 600
  timeout: 90
  period: 120
  success_threshold: 1
  failure_threshold: 5

readiness_probe:
  initial_delay: 600
  timeout: 90
  period: 120
  success_threshold: 1
  failure_threshold: 5

instance_count: 1