Add preCommand option to support configuring experimental environment by user (#2875)

f892ed67 · J-shang · GitHub · 36042435 · f892ed67 · f892ed67
Unverified Commit f892ed67 authored Sep 21, 2020 by J-shang Committed by GitHub Sep 21, 2020
12 changed files
--- a/docs/en_US/TrainingService/RemoteMachineMode.md
+++ b/docs/en_US/TrainingService/RemoteMachineMode.md
@@ -107,3 +107,79 @@ Files in `codeDir` will be uploaded to remote machines automatically. You can ru
 ```bash
 nnictl create --config examples/trials/mnist-annotation/config_remote.yml
 ```
+### Configure python environment
+By default, commands and scripts are executed in the default environment of the remote machine. If there are multiple Python virtual environments on your remote machine and you want to run experiments in a specific one, use __preCommand__ to select that Python environment on the remote machine.
+Take `examples/trials/mnist-tfv2` as an example. Below is the content of `examples/trials/mnist-tfv2/config_remote.yml`:
+```yaml
+authorName: default
+experimentName: example_mnist
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 10
+#choice: local, remote, pai
+trainingServicePlatform: remote
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 mnist.py
+  codeDir: .
+  gpuNum: 0
+#machineList can be empty if the platform is local
+machineList:
+  - ip: ${replace_to_your_remote_machine_ip}
+    username: ${replace_to_your_remote_machine_username}
+    sshKeyPath: ${replace_to_your_remote_machine_sshKeyPath}
+    # Pre-command will be executed before the remote machine executes other commands.
+    # Below is an example of specifying python environment.
+    # If you want to execute multiple commands, please use "&&" to connect them.
+    # preCommand: source ${replace_to_absolute_path_recommended_here}/bin/activate
+    # preCommand: source ${replace_to_conda_path}/bin/activate ${replace_to_conda_env_name}
+    preCommand: export PATH=${replace_to_python_environment_path_in_your_remote_machine}:$PATH
+```
+The __preCommand__ is executed before the remote machine executes any other command, so you can configure the Python environment path like this:
+```yaml
+# Linux remote machine
+preCommand: export PATH=${replace_to_python_environment_path_in_your_remote_machine}:$PATH
+# Windows remote machine
+preCommand: set path=${replace_to_python_environment_path_in_your_remote_machine};%path%
+```
+Or if you want to activate the `virtualenv` environment:
+```yaml
+# Linux remote machine
+preCommand: source ${replace_to_absolute_path_recommended_here}/bin/activate
+# Windows remote machine
+preCommand: ${replace_to_absolute_path_recommended_here}\\scripts\\activate
+```
+Or if you want to activate the `conda` environment:
+```yaml
+# Linux remote machine
+preCommand: source ${replace_to_conda_path}/bin/activate ${replace_to_conda_env_name}
+# Windows remote machine
+preCommand: call activate ${replace_to_conda_env_name}
+```
+If you want multiple commands to be executed, you can use `&&` to connect these commands:
+```yaml
+preCommand: command1 && command2 && command3
+```
+__Note__: Because __preCommand__ is executed before every other command, it is strongly recommended not to set a __preCommand__ that makes changes to the system, e.g. `mkdir` or `touch`.
--- a/docs/en_US/Tutorial/ExperimentConfig.md
+++ b/docs/en_US/Tutorial/ExperimentConfig.md
@@ -58,6 +58,7 @@ This document describes the rules to write the config file, and provides some ex
      - [gpuIndices](#gpuindices-3)
      - [maxTrialNumPerGpu](#maxtrialnumpergpu-1)
      - [useActiveGpu](#useactivegpu-1)
+      - [preCommand](#precommand)
    + [kubeflowConfig](#kubeflowconfig)
      - [operator](#operator)
      - [storage](#storage)
@@ -583,6 +584,14 @@ Optional. Bool. Default: false.
 Used to specify whether to use a GPU if there is another process. By default, NNI will use the GPU only if there is no other active process in the GPU. If __useActiveGpu__ is set to true, NNI will use the GPU regardless of another processes. This field is not applicable for NNI on Windows.
+#### preCommand
+Optional. String.
+Specifies the pre-command that will be executed before the remote machine executes other commands. Users can configure the experimental environment on the remote machine by setting __preCommand__. If multiple commands need to be executed, use `&&` to connect them, such as `preCommand: command1 && command2 && ...`.
+__Note__: Because __preCommand__ is executed before every other command, it is strongly recommended not to set a __preCommand__ that makes changes to the system, e.g. `mkdir` or `touch`.
 ### kubeflowConfig
 #### operator
@@ -795,6 +804,12 @@ If run trial jobs in remote machine, users could specify the remote machine info
      username: test
      sshKeyPath: /nni/sshkey
      passphrase: qwert
+      # Pre-command will be executed before the remote machine executes other commands.
+      # Below is an example of specifying python environment.
+      # If you want to execute multiple commands, please use "&&" to connect them.
+      # preCommand: source ${replace_to_absolute_path_recommended_here}/bin/activate
+      # preCommand: source ${replace_to_conda_path}/bin/activate ${replace_to_conda_env_name}
+      preCommand: export PATH=${replace_to_python_environment_path_in_your_remote_machine}:$PATH
  ```
 ### PAI mode

--- a/examples/trials/mnist-tfv2/config_remote.yml
+++ b/examples/trials/mnist-tfv2/config_remote.yml
+authorName: default
+experimentName: example_mnist
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 10
+#choice: local, remote, pai
+trainingServicePlatform: remote
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 mnist.py
+  codeDir: .
+  gpuNum: 0
+#machineList can be empty if the platform is local
+machineList:
+  - ip: ${replace_to_your_remote_machine_ip}
+    username: ${replace_to_your_remote_machine_username}
+    sshKeyPath: ${replace_to_your_remote_machine_sshKeyPath}
+    # Pre-command will be executed before the remote machine executes other commands.
+    # Below is an example of specifying python environment.
+    # If you want to execute multiple commands, please use "&&" to connect them.
+    # preCommand: source ${replace_to_absolute_path_recommended_here}/bin/activate
+    # preCommand: source ${replace_to_conda_path}/bin/activate ${replace_to_conda_env_name}
+    preCommand: export PATH=${replace_to_python_environment_path_in_your_remote_machine}:$PATH
--- a/src/nni_manager/rest_server/restValidationSchemas.ts
+++ b/src/nni_manager/rest_server/restValidationSchemas.ts
@@ -17,7 +17,8 @@ export namespace ValidationSchemas {
                passphrase: joi.string(),
                gpuIndices: joi.string(),
                maxTrialNumPerGpu: joi.number(),
-                useActiveGpu: joi.boolean()
+                useActiveGpu: joi.boolean(),
+                preCommand: joi.string()
            })),
            local_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
                gpuIndices: joi.string(),

--- a/src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
+++ b/src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
@@ -123,11 +123,19 @@ class LinuxCommands extends OsCommands {
        if (isFile) {
            command = `bash '${script}'`;
        } else {
-            script = script.replace('"', '\\"');
+            script = script.replace(/"/g, '\\"');
            command = `bash -c "${script}"`;
        }
        return command;
    }
+    public addPreCommand(preCommand: string | undefined, command: string | undefined): string | undefined{
+        if (command === undefined || command === '' || preCommand === undefined || preCommand === ''){
+            return command;
+        } else {
+            return `${preCommand} && ${command}`;
+        }
+    }
 }
 export { LinuxCommands };
--- a/src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
+++ b/src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
@@ -46,7 +46,7 @@ class WindowsCommands extends OsCommands {
    }
    public generateGpuStatsScript(scriptFolder: string): string {
-        return `powershell -command $env:METRIC_OUTPUT_DIR='${scriptFolder}';$app = Start-Process -FilePath python -NoNewWindow -passthru -ArgumentList '-m nni_gpu_tool.gpu_metrics_collector' -RedirectStandardOutput ${scriptFolder}\\scriptstdout -RedirectStandardError ${scriptFolder}\\scriptstderr;Write $PID ^| Out-File ${scriptFolder}\\pid -NoNewline -encoding utf8;wait-process $app.ID`;
+        return `powershell -command $env:Path=If($env:prePath){$env:prePath}Else{$env:Path};$env:METRIC_OUTPUT_DIR='${scriptFolder}';$app = Start-Process -FilePath python -NoNewWindow -passthru -ArgumentList '-m nni_gpu_tool.gpu_metrics_collector' -RedirectStandardOutput ${scriptFolder}\\scriptstdout -RedirectStandardError ${scriptFolder}\\scriptstderr;Write $PID ^| Out-File ${scriptFolder}\\pid -NoNewline -encoding utf8;wait-process $app.ID`;
    }
    public createFolder(folderName: string, sharedFolder: boolean = false): string {
@@ -122,6 +122,14 @@ class WindowsCommands extends OsCommands {
        const command = `${script}`;
        return command;
    }
+    public addPreCommand(preCommand: string | undefined, command: string | undefined): string | undefined{
+        if (command === undefined || command === '' || preCommand === undefined || preCommand === ''){
+            return command;
+        } else {
+            return `${preCommand} && set prePath=%path% && ${command}`;
+        }
+    }
 }
 export { WindowsCommands };
--- a/src/nni_manager/training_service/remote_machine/osCommands.ts
+++ b/src/nni_manager/training_service/remote_machine/osCommands.ts
@@ -28,6 +28,7 @@ abstract class OsCommands {
    public abstract killChildProcesses(pidFileName: string, killSelf: boolean): string;
    public abstract extractFile(tarFileName: string, targetFolder: string): string;
    public abstract executeScript(script: string, isFile: boolean): string;
+    public abstract addPreCommand(preCommand: string | undefined, command: string | undefined): string | undefined;
    public joinPath(...paths: string[]): string {
        let dir: string = paths.filter((path: any) => path !== '').join(this.pathSpliter);

--- a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts
@@ -23,6 +23,7 @@ export class RemoteMachineMeta {
    //TODO: initialize varialbe in constructor
    public occupiedGpuIndexMap?: Map<number, number>;
    public readonly useActiveGpu?: boolean = false;
+    public readonly preCommand?: string;
 }
 /**

--- a/src/nni_manager/training_service/remote_machine/shellExecutor.ts
+++ b/src/nni_manager/training_service/remote_machine/shellExecutor.ts
@@ -32,6 +32,7 @@ class ShellExecutor {
    private tempPath: string = "";
    private isWindows: boolean = false;
    private channelDefaultOutputs: string[] = [];
+    private preCommand: string | undefined;
    constructor() {
        this.log = getLogger();
@@ -47,6 +48,7 @@ class ShellExecutor {
            username: rmMeta.username,
            tryKeyboard: true,
        };
+        this.preCommand = rmMeta.preCommand;
        this.name = `${rmMeta.username}@${rmMeta.ip}:${rmMeta.port}`;
        if (rmMeta.passwd !== undefined) {
            connectConfig.password = rmMeta.passwd;
@@ -349,6 +351,9 @@ class ShellExecutor {
        let exitCode: number;
        const commandIndex = randomInt(10000);
+        if(this.osCommands !== undefined){
+            command = this.osCommands.addPreCommand(this.preCommand, command);
+        }
        this.log.debug(`remoteExeCommand(${commandIndex}): [${command}]`);
        // Windows always uses shell, and it needs to disable to get it works.

--- a/src/nni_manager/training_service/remote_machine/test/shellExecutor.test.ts
+++ b/src/nni_manager/training_service/remote_machine/test/shellExecutor.test.ts
@@ -36,6 +36,7 @@ async function getRemoteFileContentLoop(executor: ShellExecutor): Promise<void>
 describe('ShellExecutor test', () => {
    let skip: boolean = false;
+    let isWindows: boolean;
    let rmMeta: any;
    try {
        rmMeta = JSON.parse(fs.readFileSync('../../.vscode/rminfo.json', 'utf8'));
@@ -86,4 +87,28 @@ describe('ShellExecutor test', () => {
        await getRemoteFileContentLoop(executor);
        await executor.close();
    });
+    it('Test preCommand-1', async () => {
+        if (skip) {
+            return;
+        }
+        const executor: ShellExecutor = new ShellExecutor();
+        await executor.initialize(rmMeta);
+        const result = await executor.executeScript("ver", false, false);
+        isWindows = result.exitCode == 0 && result.stdout.search("Windows") > -1;
+        await executor.close();
+    });
+    it('Test preCommand-2', async () => {
+        if (skip) {
+            return;
+        }
+        const executor: ShellExecutor = new ShellExecutor();
+        rmMeta.preCommand = isWindows ? "set TEST_PRE_COMMAND=test_pre_command" : "export TEST_PRE_COMMAND=test_pre_command";
+        await executor.initialize(rmMeta);
+        const command = isWindows ? "python -c \"import os; print(os.environ.get(\'TEST_PRE_COMMAND\'))\"" : "python3 -c \"import os; print(os.environ.get(\'TEST_PRE_COMMAND\'))\"";
+        const result = (await executor.executeScript(command, false, false)).stdout.replace(/[\ +\r\n]/g, "");
+        chai.expect(result).eq("test_pre_command");
+        await executor.close();
+    });
 });
--- a/src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts
+++ b/src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts
@@ -25,8 +25,8 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
    Default/.vscode/rminfo.json,  whose content looks like:
    {
        "ip": "10.172.121.40",
-        "user": "user1",
+        "username": "user1",
-        "password": "mypassword"
+        "passwd": "mypassword"
    }
    */
    let skip: boolean = false;

--- a/tools/nni_cmd/config_schema.py
+++ b/tools/nni_cmd/config_schema.py
@@ -382,7 +382,8 @@ machine_list_schema = {
            Optional('passphrase'): setType('passphrase', str),
            Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
            Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int),
-            Optional('useActiveGpu'): setType('useActiveGpu', bool)
+            Optional('useActiveGpu'): setType('useActiveGpu', bool),
+            Optional('preCommand'): setType('preCommand', str)
        },
        {
            'ip': setType('ip', str),
@@ -391,7 +392,8 @@ machine_list_schema = {
            'passwd': setType('passwd', str),
            Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
            Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int),
-            Optional('useActiveGpu'): setType('useActiveGpu', bool)
+            Optional('useActiveGpu'): setType('useActiveGpu', bool),
+            Optional('preCommand'): setType('preCommand', str)
        })]
 }