# training_service.yml
# Settings under `all` -- presumably shared by every training service defined
# in this file; confirm against the code that loads it.
all:
  logCollection: http

# Kubeflow training service settings.
# Keys with no value parse as null; they look like placeholders (manager IP,
# storage credentials, trial command/image) to be supplied per environment --
# confirm with the consumer of this file.
kubeflow:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2

  kubeflowConfig:
    operator: tf-operator
    apiVersion: v1
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
  trial:
    worker:
      replicas: 1
      command:
      gpuNum: 1
      cpuNum: 1
      memoryMB: 8192
      image:
  trainingServicePlatform: kubeflow

# FrameworkController training service settings.
# Keys with no value parse as null; they look like placeholders (manager IP,
# storage credentials, trial command/image) to be supplied per environment --
# confirm with the consumer of this file.
frameworkcontroller:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  frameworkcontrollerConfig:
    serviceAccountName: frameworkbarrier
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
  trial:
    taskRoles:
      - name: worker
        taskNum: 1
        command:
        gpuNum: 1
        cpuNum: 1
        memoryMB: 8192
        image:
        frameworkAttemptCompletionPolicy:
          minFailedTaskCount: 1
          minSucceededTaskCount: 1
  trainingServicePlatform: frameworkcontroller

# Local training service: only the platform selector is set here.
local:
  trainingServicePlatform: local
# paiYarn training service settings.
# Keys with no value parse as null; they look like placeholders (host,
# credentials, directories, image) to be supplied per environment -- confirm
# with the consumer of this file.
paiYarn:
  nniManagerIp:
  maxExecDuration: 15m
  paiYarnConfig:
    host:
    passWord:
    userName:
  trainingServicePlatform: paiYarn
  trial:
    gpuNum: 1
    cpuNum: 1
    dataDir:
    image:
    memoryMB: 8192
    outputDir:
    virtualCluster:
# PAI training service settings.
# Keys with no value parse as null; they look like placeholders (host, user,
# image, NFS mount paths, storage config name) to be supplied per environment
# -- confirm with the consumer of this file.
pai:
  nniManagerIp:
  maxExecDuration: 15m
  # PAI limits job submissions, so set maxTrialNum=1 to cap the number of
  # trial jobs submitted to PAI.
  maxTrialNum: 1
  trialConcurrency: 1
  paiConfig:
    host:
    userName:
  trainingServicePlatform: pai
  trial:
    gpuNum: 1
    cpuNum: 1
    image:
    memoryMB: 8192
    virtualCluster: default
    nniManagerNFSMountPath:
    containerNFSMountPath:
    paiStorageConfigName:
# Remote (SSH machine list) training service settings.
# The single machineList entry has only empty (null) fields; they look like
# placeholders for connection details supplied per environment -- confirm
# with the consumer of this file.
remote:
  remoteConfig:
    reuse: false
  machineList:
    - ip:
      passwd:
      port:
      username:
  trainingServicePlatform: remote
# ADL training service settings.
# Keys with no value parse as null; they look like placeholders (manager IP,
# trial command/code/image, checkpoint storage) to be supplied per
# environment -- confirm with the consumer of this file.
adl:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  trial:
    namespace: default
    command:
    codeDir:
    gpuNum: 1
    cpuNum: 1
    image:
    memorySize: 1Gi
    checkpoint:
      storageClass:
      storageSize:
  trainingServicePlatform: adl