# training_service_v2.yml
---
# Hybrid template: `trainingService` is a list of platform entries (remote,
# local, aml) rather than a single mapping. Every `null` below is a
# placeholder with no value supplied in this file.
hybrid:
  trainingService:
    # Remote platform entry; `machineList` holds one machine whose connection
    # fields are all placeholders.
    - platform: remote
      machineList:
        # Keep real credentials out of version control.
        - host: null
          user: null
          password: null
          port: null
    # Local platform entry; no further settings are given here.
    - platform: local
    # aml platform entry; all four fields are placeholders.
    - platform: aml
      subscriptionId: null
      resourceGroup: null
      workspaceName: null
      computeTarget: null
# ---------------------------------------------------------------------------
# Kubeflow template. Every `null` below is a placeholder with no value
# supplied in this file.
kubeflow:
  trialGpuNumber: 0
  trialConcurrency: 2
  maxTrialNumber: 2
  nniManagerIp: null
  trainingService:
    # Scalar settings first, nested mappings afterwards.
    platform: kubeflow
    reuseMode: true
    operator: tf-operator
    apiVersion: v1
    worker:
      command: null
      # NOTE(review): snake_case key among camelCase siblings; left unchanged
      # because the consumer is not visible here. Confirm before renaming.
      code_directory: null
      dockerImage: null
      cpuNumber: 1
      gpuNumber: 0
      # NOTE(review): unit is not stated in this file (presumably MB); confirm.
      memorySize: 8192
      replicas: 1
    storage:
      storageType: azureStorage
      azureAccount: null
      azureShare: null
      keyVaultName: null
      keyVaultKey: null
# FrameworkController template. Every `null` below is a placeholder with no
# value supplied in this file.
frameworkcontroller:
  trialGpuNumber: 0
  trialConcurrency: 2
  maxTrialNumber: 2
  nniManagerIp: null
  trainingService:
    reuseMode: true
    platform: frameworkcontroller
    serviceAccountName: frameworkcontroller
    # A single task role named `worker` is defined.
    taskRoles:
      - name: worker
        dockerImage: null
        taskNumber: 1
        command: null
        gpuNumber: 0
        cpuNumber: 1
        # NOTE(review): unit is not stated in this file (presumably MB); confirm.
        memorySize: 8192
        # NOTE(review): this key and `min_failed_task_count` are snake_case
        # while `minSucceedTaskCount` is camelCase; left unchanged because the
        # consumer is not visible here. Confirm before normalizing.
        framework_attempt_completion_policy:
          min_failed_task_count: 1
          minSucceedTaskCount: 1
    storage:
      storageType: azureStorage
      azureAccount: null
      azureShare: null
      keyVaultName: null
      keyVaultKey: null