# training_service.yml
#
# One top-level key per training-service platform, plus `all`.
# NOTE(review): `all` presumably holds settings merged into every platform
# section below -- confirm against the code that consumes this file.
all:
  logCollection: http

# Kubeflow platform section.
# Keys written with no value (e.g. nniManagerIp, vaultName, image) parse as
# null. NOTE(review): presumably placeholders filled in per environment
# before the config is used -- confirm.
kubeflow:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2

  kubeflowConfig:
    operator: tf-operator
    apiVersion: v1alpha2
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
  trial:
    worker:
      replicas: 1
      command:
      gpuNum: 1
      cpuNum: 1
      memoryMB: 8192
      image:
  trainingServicePlatform: kubeflow

# FrameworkController platform section.
# Keys written with no value (e.g. nniManagerIp, command, image) parse as
# null. NOTE(review): presumably placeholders filled in per environment
# before the config is used -- confirm.
frameworkcontroller:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  frameworkcontrollerConfig:
    serviceAccountName: frameworkbarrier
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
  trial:
    # A single task role named `worker` with one task instance.
    taskRoles:
      - name: worker
        taskNum: 1
        command:
        gpuNum: 1
        cpuNum: 1
        memoryMB: 8192
        image:
        frameworkAttemptCompletionPolicy:
          minFailedTaskCount: 1
          minSucceededTaskCount: 1
  trainingServicePlatform: frameworkcontroller

# Local platform section: only the platform selector is set here.
local:
  trainingServicePlatform: local
# paiYarn platform section.
# Keys written with no value (host, passWord, userName, image, ...) parse as
# null. NOTE(review): presumably placeholders -- including the credentials --
# supplied per environment rather than committed here; confirm.
paiYarn:
  nniManagerIp:
  maxExecDuration: 15m
  paiYarnConfig:
    host:
    passWord:
    userName:
  trainingServicePlatform: paiYarn
  trial:
    gpuNum: 1
    cpuNum: 1
    dataDir:
    image:
    memoryMB: 8192
    outputDir:
    virtualCluster:
# pai platform section.
# Keys written with no value (host, userName, image, the NFS mount paths,
# paiStorageConfigName, ...) parse as null. NOTE(review): presumably
# placeholders filled in per environment before use -- confirm.
pai:
  nniManagerIp:
  maxExecDuration: 15m
  # PAI limits job submissions; set maxTrialNum=1 to cap the number of trial
  # jobs submitted to PAI
  maxTrialNum: 1
  trialConcurrency: 1
  paiConfig:
    host:
    userName:
  trainingServicePlatform: pai
  trial:
    gpuNum: 1
    cpuNum: 1
    image:
    memoryMB: 8192
    virtualCluster: default
    nniManagerNFSMountPath:
    containerNFSMountPath:
    paiStorageConfigName:
# remote platform section.
# `machineList` holds one machine entry whose connection fields are all
# written with no value (parse as null). NOTE(review): presumably filled in
# per environment before use -- confirm.
remote:
  machineList:
    - ip:
      passwd:
      port:
      username:
  trainingServicePlatform: remote