# training_service.yml
# `all` section. Judging by its name it presumably holds settings shared by
# every platform section in this file — TODO confirm against the consumer.
all:
  logCollection: 'http'

# `kubeflow` platform section.
# Keys written as `null` carry no value in this file; presumably they are
# filled in by whatever consumes it — TODO confirm.
kubeflow:
  trainingServicePlatform: 'kubeflow'
  nniManagerIp: null
  maxExecDuration: '15m'
  kubeflowConfig:
    operator: 'tf-operator'
    apiVersion: 'v1alpha2'
    # Value matches the sibling `azureStorage` key below; presumably it
    # selects that storage backend — verify against the consumer's schema.
    storage: 'azureStorage'
    keyVault:
      vaultName: null
      name: null
    azureStorage:
      accountName: null
      azureShare: null
  trial:
    worker:
      replicas: 1
      command: null
      image: null
      # Per-worker resource requests.
      gpuNum: 1
      cpuNum: 1
      memoryMB: 8192

# `local` platform section; the platform selector is the only key it sets.
local:
  trainingServicePlatform: 'local'
# `pai` platform section.
# Keys written as `null` carry no value in this file; presumably they are
# filled in by whatever consumes it — TODO confirm.
# NOTE(review): the section key is `pai` while the platform value is
# `paiYarn`; confirm this pairing is what the consumer expects.
pai:
  nniManagerIp: null
  maxExecDuration: '15m'
  paiYarnConfig:
    host: null
    # Credentials are deliberately unset here; do not commit real values.
    passWord: null
    userName: null
  trainingServicePlatform: 'paiYarn'
  trial:
    gpuNum: 1
    cpuNum: 1
    dataDir: null
    image: null
    memoryMB: 8192
    outputDir: null
    virtualCluster: null
# `remote` platform section.
# `machineList` holds a single entry whose fields are all unset in this
# file; presumably they are filled in by whatever consumes it — TODO confirm.
remote:
  trainingServicePlatform: 'remote'
  machineList:
    - ip: null
      port: null
      username: null
      # Credential is deliberately unset here; do not commit a real value.
      passwd: null