training_service.yml 2.8 KB
Newer Older
1
2
3
# Section keyed `all`: sets the log collection mode to `http`.
# NOTE(review): presumably applied to every training service section in this
# file - confirm against the code that consumes it.
all:
  logCollection: http

4
5
6
# Kubeflow training service section.
# Keys written without a value parse as null.
# NOTE(review): presumably they are filled in per environment before use -
# confirm with the consumer of this file.
kubeflow:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2

  kubeflowConfig:
    operator: tf-operator
    apiVersion: v1
    # `storage` names the sibling `azureStorage` mapping below.
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
    namespace: kubeflow
  trial:
    # Single worker replica requesting 1 GPU, 1 CPU and 8192 MB of memory.
    worker:
      replicas: 1
      command:
      gpuNum: 1
      cpuNum: 1
      memoryMB: 8192
      image:
  trainingServicePlatform: kubeflow

32
33
34
# FrameworkController training service section.
# Keys written without a value parse as null.
# NOTE(review): presumably they are filled in per environment before use -
# confirm with the consumer of this file.
frameworkcontroller:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  frameworkcontrollerConfig:
    serviceAccountName: frameworkcontroller
    # `storage` names the sibling `azureStorage` mapping below.
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
    namespace: kubeflow
  trial:
    taskRoles:
      # One task role named `worker` with a single task.
      - name: worker
        taskNum: 1
        command:
        gpuNum: 1
        cpuNum: 1
        memoryMB: 8192
        image:
        frameworkAttemptCompletionPolicy:
          minFailedTaskCount: 1
          minSucceededTaskCount: 1
  trainingServicePlatform: frameworkcontroller

62
63
# Local training service section: only selects the `local` platform.
local:
  trainingServicePlatform: local
SparkSnail's avatar
SparkSnail committed
64
65
66
# PAI training service section.
# Keys written without a value parse as null.
# NOTE(review): presumably they are filled in per environment before use -
# confirm with the consumer of this file.
pai:
  nniManagerIp:
  maxExecDuration: 15m
  # PAI limits job submissions, so maxTrialNum is set to 1 to cap the number
  # of trial jobs submitted to PAI.
  maxTrialNum: 1
  trialConcurrency: 1
  paiConfig:
    host:
    userName:
  trainingServicePlatform: pai
  trial:
    gpuNum: 1
    cpuNum: 1
    image:
    memoryMB: 8192
    virtualCluster: default
    nniManagerNFSMountPath:
    containerNFSMountPath:
    paiStorageConfigName:
83
# Remote training service section.
# Keys written without a value parse as null.
# NOTE(review): presumably they are filled in per environment before use -
# confirm with the consumer of this file.
remote:
  remoteConfig:
    reuse: false
  # One machine entry; its connection fields are left empty here.
  machineList:
    - ip:
      passwd:
      port:
      username:
  trainingServicePlatform: remote
  sharedStorage:
    storageAccountKey:
    nfsServer:
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Hybrid training service section: combines the platforms listed under
# `hybridConfig.trainingServicePlatforms`.
# Keys written without a value parse as null.
# NOTE(review): presumably they are filled in per environment before use -
# confirm with the consumer of this file.
hybrid:
  maxExecDuration: 15m
  nniManagerIp:
  maxTrialNum: 2
  trialConcurrency: 2
  trial:
    gpuNum: 0
  trainingServicePlatform: hybrid
  hybridConfig:
    # TODO: Add more platforms
    trainingServicePlatforms:
      - remote
      - local
  # NOTE(review): `machineList` and `remoteConfig` look like the settings for
  # the `remote` member platform - confirm.
  machineList:
    - ip:
      passwd:
      port:
      username:
  remoteConfig:
    reuse: true
SparkSnail's avatar
SparkSnail committed
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# ADL training service section.
# Keys written without a value parse as null.
# NOTE(review): presumably they are filled in per environment before use -
# confirm with the consumer of this file.
adl:
  trainingServicePlatform: adl
  nniManagerIp:
  maxExecDuration: 15m
  # use a small trial number to make IT faster
  trialConcurrency: 2
  maxTrialNum: 2
  trial:
    namespace: default
    image:
    command:
    codeDir:
    # Per-trial resource requests.
    cpuNum: 1
    gpuNum: 1
    memorySize: 1Gi
    checkpoint:
      storageClass:
      storageSize:
SparkSnail's avatar
SparkSnail committed
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# AML training service section.
# Keys written without a value parse as null.
# NOTE(review): presumably they are filled in per environment before use -
# confirm with the consumer of this file.
aml:
  nniManagerIp:
  maxExecDuration: 15m
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  trainingServicePlatform: aml
  trial:
    gpuNum: 1
    image:
  amlConfig:
    subscriptionId:
    resourceGroup:
    workspaceName:
    computeTarget: