# training_service.yml
# Section keyed `all`.
# NOTE(review): presumably applied to every platform section in this file —
# confirm against the code that consumes it.
all:
  logCollection: http

# Settings for the `kubeflow` training service platform.
# Keys set to `null` carry no value in this file.
kubeflow:
  maxExecDuration: 15m
  nniManagerIp: null
  kubeflowConfig:
    operator: tf-operator
    apiVersion: v1alpha2
    # Value matches the sibling `azureStorage` key below.
    storage: azureStorage
    keyVault:
      vaultName: null
      name: null
    azureStorage:
      accountName: null
      azureShare: null
  trial:
    # A single role, `worker`, is declared for the trial.
    worker:
      replicas: 1
      command: null
      gpuNum: 1
      cpuNum: 1
      memoryMB: 8192
      image: null
  trainingServicePlatform: kubeflow

# Settings for the `frameworkcontroller` training service platform.
# Keys set to `null` carry no value in this file.
frameworkcontroller:
  maxExecDuration: 15m
  nniManagerIp: null
  frameworkcontrollerConfig:
    serviceAccountName: frameworkbarrier
    # Value matches the sibling `azureStorage` key below.
    storage: azureStorage
    keyVault:
      vaultName: null
      name: null
    azureStorage:
      accountName: null
      azureShare: null
  trial:
    # One task role, named `worker`, is listed.
    taskRoles:
      - name: worker
        taskNum: 1
        command: null
        gpuNum: 1
        cpuNum: 1
        memoryMB: 8192
        image: null
        frameworkAttemptCompletionPolicy:
          minFailedTaskCount: 1
          minSucceededTaskCount: 1
  trainingServicePlatform: frameworkcontroller

# Settings for the `local` training service platform; only the platform name
# is set here.
local:
  trainingServicePlatform: local
# Settings for the section keyed `pai`.
# Keys set to `null` carry no value in this file.
# NOTE(review): the section key is `pai`, but the config key (`paiYarnConfig`)
# and the platform value (`paiYarn`) use a different name — confirm this is
# what the consumer of this file looks up.
pai:
  nniManagerIp: null
  maxExecDuration: 15m
  paiYarnConfig:
    host: null
    # Key is spelled `passWord` (capital W); kept as-is because consumers
    # read it by this exact name.
    passWord: null
    userName: null
  trainingServicePlatform: paiYarn
  trial:
    gpuNum: 1
    cpuNum: 1
    dataDir: null
    image: null
    memoryMB: 8192
    outputDir: null
    virtualCluster: null
# Settings for the `remote` training service platform.
# Keys set to `null` carry no value in this file.
remote:
  # A single list entry whose ip/passwd/port/username are all unset.
  # Sequence is indented under its key to match `taskRoles` elsewhere in
  # this file. Keep real credentials out of version control.
  machineList:
    - ip: null
      passwd: null
      port: null
      username: null
  trainingServicePlatform: remote