Unverified Commit 01d609a2 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Support setting shmMB in pai config (#847)

parent 05c8c985
......@@ -43,7 +43,7 @@ paiConfig:
Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if you want to start experiment in pai mode.
Compared with LocalMode and [RemoteMachineMode](RemoteMachineMode.md), trial configuration in pai mode have five additional keys:
Compared with LocalMode and [RemoteMachineMode](RemoteMachineMode.md), trial configuration in pai mode have these additional keys:
* cpuNum
* Required key. Should be positive number based on your trial program's CPU requirement
* memoryMB
......@@ -55,6 +55,10 @@ Compared with LocalMode and [RemoteMachineMode](RemoteMachineMode.md), trial con
* Optional key. It specifies the HDFS data direcotry for trial to download data. The format should be something like hdfs://{your HDFS host}:9000/{your data directory}
* outputDir
* Optional key. It specifies the HDFS output directory for trial. Once the trial is completed (either succeed or fail), trial's stdout, stderr will be copied to this directory by NNI sdk automatically. The format should be something like hdfs://{your HDFS host}:9000/{your output directory}
* virturlCluster
* Optional key. Set the virtualCluster of PAI. If omitted, the job will run on default virtual cluster.
* shmMB
* Optional key. Set the shmMB configuration of PAI, it set the shared memory for one task in the task role.
Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command
```
......
......@@ -42,6 +42,7 @@ export namespace ValidationSchemas {
gpuNum: joi.number().min(0),
command: joi.string().min(1),
virtualCluster: joi.string(),
shmMB: joi.number(),
worker: joi.object({
replicas: joi.number().min(1).required(),
image: joi.string().min(1),
......@@ -76,6 +77,7 @@ export namespace ValidationSchemas {
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
shmMB: joi.number(),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required(),
frameworkAttemptCompletionPolicy: joi.object({
......
......@@ -34,6 +34,8 @@ export class PAITaskRole {
public readonly gpuNumber: number;
// Executable command for tasks in the task role, can not be empty
public readonly command: string;
//Shared memory for one task in the task role
public readonly shmMB?: number;
/**
* Constructor
......@@ -44,13 +46,14 @@ export class PAITaskRole {
* @param gpuNumber GPU number for one task in the task role, no less than 0
* @param command Executable command for tasks in the task role, can not be empty
*/
constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number, command : string) {
constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number, command : string, shmMB?: number) {
this.name = name;
this.taskNumber = taskNumber;
this.cpuNumber = cpuNumber;
this.memoryMB = memoryMB;
this.gpuNumber = gpuNumber;
this.command = command;
this.shmMB = shmMB;
}
}
......@@ -119,9 +122,11 @@ export class NNIPAITrialConfig extends TrialConfig{
//The virtual cluster job runs on. If omitted, the job will run on default virtual cluster
public virtualCluster?: string;
//Shared memory for one task in the task role
public shmMB?: number;
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number,
image: string, dataDir: string, outputDir: string, virtualCluster?: string) {
image: string, dataDir: string, outputDir: string, virtualCluster?: string, shmMB?: number) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
......@@ -129,6 +134,7 @@ export class NNIPAITrialConfig extends TrialConfig{
this.dataDir = dataDir;
this.outputDir = outputDir;
this.virtualCluster = virtualCluster;
this.shmMB = shmMB;
}
}
......@@ -242,7 +242,9 @@ class PAITrainingService implements TrainingService {
// Task GPU number
this.paiTrialConfig.gpuNum,
// Task command
nniPaiTrialCommand)];
nniPaiTrialCommand,
// Task shared memory
this.paiTrialConfig.shmMB)];
const paiJobConfig : PAIJobConfig = new PAIJobConfig(
// Job name
......
......@@ -130,6 +130,7 @@ pai_trial_schema = {
'cpuNum': And(int, lambda x: 0 <= x <= 99999),
'memoryMB': int,
'image': str,
Optional('shmMB'): int,
Optional('dataDir'): Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),
Optional('outputDir'): Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),
Optional('virtualCluster'): str
......
......@@ -215,10 +215,19 @@ def validate_machine_list(experiment_config):
print_error('Please set machineList!')
exit(1)
def validate_pai_trial_conifg(experiment_config):
'''validate the trial config in pai platform'''
if experiment_config.get('trainingServicePlatform') == 'pai':
if experiment_config.get('trial').get('shmMB') and \
experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']:
print_error('shmMB should be no more than memoryMB!')
exit(1)
def validate_all_content(experiment_config, config_path):
'''Validate whether experiment_config is valid'''
parse_path(experiment_config, config_path)
validate_common_content(experiment_config)
validate_pai_trial_conifg(experiment_config)
experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration'])
if experiment_config.get('advisor'):
parse_advisor_content(experiment_config)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment