"testing/vscode:/vscode.git/clone" did not exist on "310fea95048c0514ff4d4771af0ff094c96e0108"
Unverified Commit 704b60e3 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Add portList config in PAI trainingService (#1467)

parent 957c6783
......@@ -55,6 +55,32 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod
* Optional key. Set the shmMB configuration of OpenPAI; it sets the shared memory for one task in the task role.
* authFile
* Optional key, Set the auth file path for private registry while using PAI mode, [Refer](https://github.com/microsoft/pai/blob/2ea69b45faa018662bc164ed7733f6fdbb4c42b3/docs/faq.md#q-how-to-use-private-docker-registry-job-image-when-submitting-an-openpai-job), you can prepare the authFile and simply provide the local path of this file, NNI will upload this file to HDFS for you.
* portList
* Optional key. Set the portList configuration of OpenPAI; it specifies a list of ports used in the container, [Refer](https://github.com/microsoft/pai/blob/b2324866d0280a2d22958717ea6025740f71b9f0/docs/job_tutorial.md#specification).
The config schema in NNI is shown below:
```
portList:
- label: test
beginAt: 8080
portNumber: 2
```
Let's say you want to launch TensorBoard in the mnist example using such a port. The first step is to write a wrapper script `launch_pai.sh` for `mnist.py`.
```bash
export TENSORBOARD_PORT=PAI_PORT_LIST_${PAI_CURRENT_TASK_ROLE_NAME}_0_tensorboard
tensorboard --logdir . --port ${!TENSORBOARD_PORT} &
python3 mnist.py
```
The portList section of the config file should be filled in as follows:
```yaml
trial:
command: bash launch_pai.sh
portList:
- label: tensorboard
beginAt: 0
portNumber: 1
```
Once you have filled in the NNI experiment config file and saved it (for example, as exp_pai.yml), run the following command
```
......
......@@ -33,4 +33,4 @@ abstract class TrainingService {
}
```
The parent class TrainingService has a few abstract functions; users need to inherit from the parent class and implement all of these abstract functions.
For more information about how to write your own TrainingService, please [refer](https://github.com/SparkSnail/nni/blob/dev-trainingServiceDoc/docs/en_US/TrainingService/HowToImplementTrainingService.md).
For more information about how to write your own TrainingService, please [refer](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/HowToImplementTrainingService.md).
......@@ -53,6 +53,11 @@ export namespace ValidationSchemas {
shmMB: joi.number(),
authFile: joi.string(),
nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
portList: joi.array().items(joi.object({
label: joi.string().required(),
beginAt: joi.number().required(),
portNumber: joi.number().required(),
})),
worker: joi.object({
replicas: joi.number().min(1).required(),
image: joi.string().min(1),
......
......@@ -39,6 +39,8 @@ export class PAITaskRole {
public readonly command: string;
//Shared memory for one task in the task role
public readonly shmMB?: number;
//portList to specify the port used in container
public portList?: portListMetaData[];
/**
* Constructor
......@@ -50,7 +52,7 @@ export class PAITaskRole {
* @param command Executable command for tasks in the task role, can not be empty
*/
constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number,
command : string, shmMB?: number) {
command : string, shmMB?: number, portList?: portListMetaData[]) {
this.name = name;
this.taskNumber = taskNumber;
this.cpuNumber = cpuNumber;
......@@ -58,6 +60,7 @@ export class PAITaskRole {
this.gpuNumber = gpuNumber;
this.command = command;
this.shmMB = shmMB;
this.portList = portList;
}
}
......@@ -120,6 +123,16 @@ export class PAIClusterConfig {
}
}
/**
 * One entry of the `portList` setting attached to a PAI task role.
 *
 * A freshly constructed instance carries placeholder values only
 * (empty label, zero for both numbers).
 */
export class portListMetaData {
    // Label of the port entry.
    public readonly label: string;
    // Starting port of the entry.
    // NOTE(review): the meaning of 0 is not defined here; confirm against the OpenPAI job specification.
    public readonly beginAt: number;
    // Presumably how many ports the entry covers; confirm against the OpenPAI job specification.
    public readonly portNumber: number;

    constructor() {
        this.label = '';
        this.beginAt = 0;
        this.portNumber = 0;
    }
}
/**
* PAI trial configuration
*/
......@@ -134,9 +147,11 @@ export class NNIPAITrialConfig extends TrialConfig {
public shmMB?: number;
//authentication file used for private Docker registry
public authFile?: string;
//portList to specify the port used in container
public portList?: portListMetaData[];
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number,
image: string, virtualCluster?: string, shmMB?: number, authFile?: string) {
image: string, virtualCluster?: string, shmMB?: number, authFile?: string, portList?: portListMetaData[]) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
......@@ -144,5 +159,6 @@ export class NNIPAITrialConfig extends TrialConfig {
this.virtualCluster = virtualCluster;
this.shmMB = shmMB;
this.authFile = authFile;
this.portList = portList;
}
}
......@@ -79,6 +79,7 @@ class PAITrainingService implements TrainingService {
private logCollection: string;
private isMultiPhase: boolean = false;
private authFileHdfsPath: string | undefined = undefined;
private portList?: string | undefined;
constructor() {
this.log = getLogger();
......@@ -446,6 +447,8 @@ class PAITrainingService implements TrainingService {
nniPaiTrialCommand,
// Task shared memory
this.paiTrialConfig.shmMB,
// Task portList
this.paiTrialConfig.portList
)
];
......
......@@ -240,7 +240,12 @@ pai_trial_schema = {
Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\
error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'),
Optional('virtualCluster'): setType('virtualCluster', str),
Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode')
Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
Optional('portList'): [{
"label": setType('label', str),
"beginAt": setType('beginAt', int),
"portNumber": setType('portNumber', int)
}]
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment