# Per-platform training-service settings, keyed by platform name.
#
# Keys with an empty value parse as null. They look like placeholders that
# the test harness fills in (addresses, credentials, images, commands)
# before the config is used -- TODO confirm against the consumer.

# NOTE(review): presumably applied to every platform below -- confirm.
all:
  logCollection: http

kubeflow:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  kubeflowConfig:
    operator: tf-operator
    apiVersion: v1
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
  trial:
    worker:
      replicas: 1
      command:
      gpuNum: 1
      cpuNum: 1
      memoryMB: 8192
      image:
  trainingServicePlatform: kubeflow

frameworkcontroller:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  frameworkcontrollerConfig:
    serviceAccountName: frameworkbarrier
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
  trial:
    taskRoles:
      - name: worker
        taskNum: 1
        command:
        gpuNum: 1
        cpuNum: 1
        memoryMB: 8192
        image:
        frameworkAttemptCompletionPolicy:
          minFailedTaskCount: 1
          minSucceededTaskCount: 1
  trainingServicePlatform: frameworkcontroller

local:
  trainingServicePlatform: local

pai:
  nniManagerIp:
  maxExecDuration: 15m
  # PAI has job submission limitation, set maxTrialNum=1 to control trial
  # job numbers for PAI
  maxTrialNum: 1
  trialConcurrency: 1
  paiConfig:
    host:
    userName:
  trainingServicePlatform: pai
  trial:
    gpuNum: 1
    cpuNum: 1
    image:
    memoryMB: 8192
    virtualCluster: default
    nniManagerNFSMountPath:
    containerNFSMountPath:
    paiStorageConfigName:

remote:
  remoteConfig:
    reuse: false
  machineList:
    - ip:
      passwd:
      port:
      username:
  trainingServicePlatform: remote
  # NOTE(review): nesting reconstructed from key order; sharedStorage is
  # assumed to belong to the remote platform -- confirm.
  sharedStorage:
    storageAccountKey:
    nfsServer:

hybrid:
  maxExecDuration: 15m
  nniManagerIp:
  maxTrialNum: 2
  trialConcurrency: 2
  trial:
    gpuNum: 0
  trainingServicePlatform: hybrid
  hybridConfig:
    # TODO: Add more platforms
    trainingServicePlatforms:
      - remote
      - local
  # NOTE(review): machineList and remoteConfig are assumed to sit beside
  # hybridConfig (not inside it) -- confirm.
  machineList:
    - ip:
      passwd:
      port:
      username:
  remoteConfig:
    reuse: true

adl:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  trial:
    namespace: default
    command:
    codeDir:
    gpuNum: 1
    cpuNum: 1
    image:
    memorySize: 1Gi
    checkpoint:
      storageClass:
      storageSize:
  trainingServicePlatform: adl

aml:
  nniManagerIp:
  maxExecDuration: 15m
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  trainingServicePlatform: aml
  trial:
    gpuNum: 1
    image:
  amlConfig:
    subscriptionId:
    resourceGroup:
    workspaceName:
    computeTarget: