fix #1578 and some improvements (#2370)

Add shell support for the SSH connection, so that remote scripts can be started with the user's environment. Minor fixes: 1. Fix gpu_metrics_collector to support pyenv. pyenv creates one more process, so the original pgrep code always found extra processes and gpu_metrics_collector could not start. 2. Fix NASUI failure on dev-install-node-modules by creating the subfolder every time. 3. Fix the Makefile to reduce mis-created links, and other minor issues. 4. Add node --watch for nni_manager for a better dev experience.

fix #1578 and some improvements (#2370)
Add shell support for the SSH connection, so that remote scripts can be started with the user's environment. Minor fixes: 1. Fix gpu_metrics_collector to support pyenv. pyenv creates one more process, so the original pgrep code always found extra processes and gpu_metrics_collector could not start. 2. Fix NASUI failure on dev-install-node-modules by creating the subfolder every time. 3. Fix the Makefile to reduce mis-created links, and other minor issues. 4. Add node --watch for nni_manager for a better dev experience.
1c6f1efa · Chi Song · GitHub · 1d893dda · 1c6f1efa · 1c6f1efa
Unverified Commit 1c6f1efa authored Apr 26, 2020 by Chi Song Committed by GitHub Apr 26, 2020
7 changed files
--- a/Makefile
+++ b/Makefile
@@ -173,12 +173,12 @@ install-python-modules:
 dev-install-python-modules:
 	#$(_INFO) Installing Python SDK $(_END)
 	mkdir -p build
-	ln -sf ../src/sdk/pynni/nni build/nni
+	ln -sf ../src/sdk/pynni/nni build
-	ln -sf ../src/sdk/pynni/nnicli build/nnicli
+	ln -sf ../src/sdk/pycli/nnicli build
-	ln -sf ../tools/nni_annotation build/nni_annotation
+	ln -sf ../tools/nni_annotation build
-	ln -sf ../tools/nni_cmd build/nni_cmd
+	ln -sf ../tools/nni_cmd build
-	ln -sf ../tools/nni_trial_tool build/nni_trial_tool
+	ln -sf ../tools/nni_trial_tool build
-	ln -sf ../tools/nni_gpu_tool build/nni_gpu_tool
+	ln -sf ../tools/nni_gpu_tool build
 	cp setup.py build/
 	cp README.md build/
 	sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' build/setup.py
@@ -209,10 +209,12 @@ dev-install-node-modules:
 	ln -sf ${PWD}/src/nni_manager/dist $(NNI_PKG_FOLDER)
 	cp src/nni_manager/package.json $(NNI_PKG_FOLDER)
 	sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(NNI_PKG_FOLDER)/package.json
-	ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)/node_modules
+	ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)
-	ln -sf ${PWD}/src/webui/build $(NNI_PKG_FOLDER)/static
+	ln -sf ${PWD}/src/webui/build -t $(NNI_PKG_FOLDER)
-	ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)/build
+	mv $(NNI_PKG_FOLDER)/build $(NNI_PKG_FOLDER)/static
-	ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)/server.js
+	mkdir -p $(NASUI_PKG_FOLDER)
+	ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)
+	ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)
 .PHONY: install-scripts
 install-scripts:

--- a/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md
+++ b/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md
@@ -6,7 +6,7 @@
 To debug NNI source code, your development environment should be an Ubuntu 16.04 (or above) system with Python 3 and pip 3 installed. Then follow the steps below.
-**1. Clone the source code**
+### 1. Clone the source code
 Run the command
@@ -16,7 +16,7 @@ git clone https://github.com/Microsoft/nni.git
 to clone the source code
-**2. Prepare the debug environment and install dependencies**
+### 2. Prepare the debug environment and install dependencies
 Change directory to the source code folder, then run the command
@@ -26,7 +26,7 @@ make install-dependencies
 to install the dependent tools for the environment
-**3. Build source code**
+### 3. Build source code
 Run the command
@@ -36,7 +36,7 @@ make build
 to build the source code
-**4. Install NNI to development environment**
+### 4. Install NNI to development environment
 Run the command
@@ -46,7 +46,7 @@ make dev-install
 to install the distribution content to development environment, and create cli scripts
-**5. Check if the environment is ready**
+### 5. Check if the environment is ready
 Now, you can try to start an experiment to check if your environment is ready.
 For example, run the command
@@ -57,9 +57,21 @@ nnictl create --config ~/nni/examples/trials/mnist-tfv1/config.yml
 And open WebUI to check if everything is OK
-**6. Redeploy**
+### 6. Redeploy
+After changing the code, you may need to redeploy, depending on what kind of code was changed.
+#### Python
+Python changes do not need to be redeployed, but nnictl may need to be restarted.
+#### TypeScript
+* If `src/nni_manager` is changed, keep `yarn watch` running under this folder. It rebuilds the code automatically whenever files change.
+* If `src/webui` or `src/nasui` is changed, use **step 3** to rebuild code.
+The nnictl may need to be restarted.
-After the code changes, use **step 3** to rebuild your codes, then the changes will take effect immediately.
 ---
 Finally, we wish you a wonderful day.

--- a/src/nni_manager/package.json
+++ b/src/nni_manager/package.json
@@ -6,6 +6,7 @@
    "build": "tsc",
    "test": "nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors",
    "start": "node dist/main.js",
+    "watch": "tsc --watch",
    "eslint": "npx eslint ./ --ext .ts"
  },
  "license": "MIT",

--- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -442,7 +442,7 @@ class RemoteMachineTrainingService implements TrainingService {
        //Begin to execute gpu_metrics_collection scripts
        const script = getGpuMetricsCollectorBashScriptContent(remoteGpuScriptCollectorDir);
-        SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn);
+        SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn, true);
        const disposable: Rx.IDisposable = this.timer.subscribe(
            async (tick: number) => {
@@ -588,7 +588,7 @@ class RemoteMachineTrainingService implements TrainingService {
        // Copy files in codeDir to remote working directory
        await SSHClientUtility.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, sshClient, this.remoteOS);
        // Execute command in remote machine
-        SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient);
+        SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient, true);
    }
    private getRmMetaByHost(host: string): RemoteMachineMeta {

--- a/src/nni_manager/training_service/remote_machine/sshClientUtility.ts
+++ b/src/nni_manager/training_service/remote_machine/sshClientUtility.ts
@@ -58,7 +58,7 @@ export namespace SSHClientUtility {
     * @param command the command to execute remotely
     * @param client SSH Client
     */
-    export function remoteExeCommand(command: string, client: Client): Promise<RemoteCommandResult> {
+    export function remoteExeCommand(command: string, client: Client, useShell: boolean = false): Promise<RemoteCommandResult> {
        const log: Logger = getLogger();
        log.debug(`remoteExeCommand: command: [${command}]`);
        const deferred: Deferred<RemoteCommandResult> = new Deferred<RemoteCommandResult>();
@@ -66,31 +66,43 @@ export namespace SSHClientUtility {
        let stderr: string = '';
        let exitCode: number;
-        client.exec(command, (err: Error, channel: ClientChannel) => {
+        const callback = (err: Error, channel: ClientChannel): void => {
            if (err !== undefined && err !== null) {
                log.error(`remoteExeCommand: ${err.message}`);
                deferred.reject(err);
                return;
            }
-            channel.on('data', (data: any, dataStderr: any) => {
+            channel.on('data', (data: any) => {
-                if (dataStderr !== undefined && dataStderr !== null) {
+                stdout += data;
-                    stderr += data.toString();
+            });
-                } else {
+            channel.on('exit', (code: any) => {
-                    stdout += data.toString();
-                }
-            })
-              .on('exit', (code: any, signal: any) => {
                exitCode = <number>code;
+                log.debug(`remoteExeCommand exit(${exitCode})\nstdout: ${stdout}\nstderr: ${stderr}`);
                deferred.resolve({
-                    stdout : stdout,
+                    stdout: stdout,
-                    stderr : stderr,
+                    stderr: stderr,
-                    exitCode : exitCode
+                    exitCode: exitCode
                });
            });
+            channel.stderr.on('data', function (data) {
+                stderr += data;
            });
+            if (useShell) {
+                channel.stdin.write(`${command}\n`);
+                channel.end("exit\n");
+            }
+            return;
+        };
+        if (useShell) {
+            client.shell(callback);
+        } else {
+            client.exec(command, callback);
+        }
        return deferred.promise;
    }

--- a/src/webui/yarn.lock
+++ b/src/webui/yarn.lock
@@ -5593,7 +5593,7 @@ load-json-file@^4.0.0:
    pify "^3.0.0"
    strip-bom "^3.0.0"
-loader-fs-cache@>=1.0.3, loader-fs-cache@^1.0.0:
+loader-fs-cache@^1.0.0:
  version "1.0.3"
  resolved "https://registry.yarnpkg.com/loader-fs-cache/-/loader-fs-cache-1.0.3.tgz#f08657646d607078be2f0a032f8bd69dd6f277d9"
  integrity sha512-ldcgZpjNJj71n+2Mf6yetz+c9bM4xpKtNds4LbqXzU/PTdeAX0g3ytnU1AJMEcTk2Lex4Smpe3Q/eCTsvUBxbA==

--- a/tools/nni_gpu_tool/gpu_metrics_collector.py
+++ b/tools/nni_gpu_tool/gpu_metrics_collector.py
@@ -10,6 +10,7 @@ import traceback
 from xml.dom import minidom
 def check_ready_to_run():
    if sys.platform == 'win32':
        pgrep_output = subprocess.check_output(
@@ -20,17 +21,20 @@ def check_ready_to_run():
        pidList.remove(os.getpid())
        return not pidList
    else:
-        pgrep_output = subprocess.check_output('pgrep -fxu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
+        pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
        pidList = []
        for pid in pgrep_output.splitlines():
-            pidList.append(int(pid))
+            pid = pid.decode()
-        pidList.remove(os.getpid())
+            if "pgrep " in pid or pid.startswith('%s ' % os.getpid()) or pid.startswith('%s ' % os.getppid()):
+                continue
+            pidList.append(pid)
        return not pidList
 def main(argv):
    metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
    if check_ready_to_run() == False:
-        # GPU metrics collector is already running. Exit
+        print("GPU metrics collector is already running. exiting...")
        exit(2)
    cmd = 'nvidia-smi -q -x'.split()
    while(True):
@@ -44,6 +48,7 @@ def main(argv):
        # TODO: change to sleep time configurable via arguments
        time.sleep(5)
 def parse_nvidia_smi_result(smi, outputDir):
    try:
        old_umask = os.umask(0)
@@ -70,13 +75,14 @@ def parse_nvidia_smi_result(smi, outputDir):
                outPut["gpuInfos"].append(gpuInfo)
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
-            outputFile.flush();
+            outputFile.flush()
-    except:
+    except Exception as error:
        # e_info = sys.exc_info()
-        print('xmldoc paring error')
+        print('gpu_metrics_collector error: %s' % error)
    finally:
        os.umask(old_umask)
 def gen_empty_gpu_metric(outputDir):
    try:
        old_umask = os.umask(0)