# training_service.yml
# Settings stored under the `all` key.
# NOTE(review): presumably applied to every platform section in this file —
# confirm against the consumer of this config.
all:
  # Log collection mode is set to `http`.
  logCollection: http

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Kubeflow section: selects the `kubeflow` platform, using tf-operator
# (apiVersion v1alpha2) with `azureStorage` as the storage kind.
# Keys written without a value parse as null.
# NOTE(review): presumably these are filled in before the config is used —
# confirm against the consumer of this file.
kubeflow:
  trainingServicePlatform: kubeflow
  nniManagerIp:
  maxExecDuration: 15m
  trial:
    # Single `worker` role with one replica.
    worker:
      image:
      command:
      replicas: 1
      cpuNum: 1
      gpuNum: 1
      memoryMB: 8192
  kubeflowConfig:
    apiVersion: v1alpha2
    operator: tf-operator
    storage: azureStorage
    azureStorage:
      accountName:
      azureShare:
    keyVault:
      name:
      vaultName:

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# FrameworkController section: selects the `frameworkcontroller` platform,
# with service account `frameworkbarrier` and `azureStorage` as the storage
# kind. Keys written without a value parse as null.
# NOTE(review): presumably these are filled in before the config is used —
# confirm against the consumer of this file.
frameworkcontroller:
  trainingServicePlatform: frameworkcontroller
  nniManagerIp:
  maxExecDuration: 15m
  trial:
    taskRoles:
      # One task role named `worker`, running a single task.
      - name: worker
        image:
        command:
        taskNum: 1
        cpuNum: 1
        gpuNum: 1
        memoryMB: 8192
        frameworkAttemptCompletionPolicy:
          minSucceededTaskCount: 1
          minFailedTaskCount: 1
  frameworkcontrollerConfig:
    serviceAccountName: frameworkbarrier
    storage: azureStorage
    azureStorage:
      accountName:
      azureShare:
    keyVault:
      name:
      vaultName:

53
54
55
# Local section: only selects the `local` platform; no other settings are
# defined here.
local:
  trainingServicePlatform: local
# PAI section. Keys written without a value parse as null.
# NOTE(review): presumably these are filled in before the config is used —
# confirm against the consumer of this file.
pai:
  nniManagerIp:
  maxExecDuration: 15m
  # Connection settings; credentials are intentionally left empty here.
  paiYarnConfig:
    host:
    passWord:
    userName:
  # NOTE(review): the section key is `pai` but the platform value is
  # `paiYarn` — confirm this pairing is intended.
  trainingServicePlatform: paiYarn
  trial:
    gpuNum: 1
    cpuNum: 1
    dataDir:
    image:
    memoryMB: 8192
    outputDir:
    virtualCluster:
71
72
73
74
75
76
77
# Remote section: selects the `remote` platform and lists the machines to
# use. Keys written without a value parse as null.
# NOTE(review): presumably these are filled in before the config is used —
# confirm against the consumer of this file.
remote:
  machineList:
    # Single machine entry; sequence items are indented under their parent
    # key, matching the `taskRoles` list elsewhere in this file.
    - ip:
      passwd:
      port:
      username:
  trainingServicePlatform: remote