Unverified Commit 1c6f1efa authored by Chi Song's avatar Chi Song Committed by GitHub
Browse files

fix #1578 and some improvements (#2370)

Add shell support for SSH connections, so that remote scripts can be started with the user's environment.

Minor fixes:

1. Fix gpu_metrics_collector to support pyenv. Because pyenv creates one more process, the original pgrep code always found extra processes, so gpu_metrics_collector could not start.
2. Fix the NASUI failure in dev-install-node-modules by creating the subfolder every time.
3. Fix the Makefile to reduce mis-created links, and fix other minor issues.
4. Add node --watch for nni_manager for better dev experience.
parent 1d893dda
......@@ -173,12 +173,12 @@ install-python-modules:
dev-install-python-modules:
#$(_INFO) Installing Python SDK $(_END)
mkdir -p build
ln -sf ../src/sdk/pynni/nni build/nni
ln -sf ../src/sdk/pynni/nnicli build/nnicli
ln -sf ../tools/nni_annotation build/nni_annotation
ln -sf ../tools/nni_cmd build/nni_cmd
ln -sf ../tools/nni_trial_tool build/nni_trial_tool
ln -sf ../tools/nni_gpu_tool build/nni_gpu_tool
ln -sf ../src/sdk/pynni/nni build
ln -sf ../src/sdk/pycli/nnicli build
ln -sf ../tools/nni_annotation build
ln -sf ../tools/nni_cmd build
ln -sf ../tools/nni_trial_tool build
ln -sf ../tools/nni_gpu_tool build
cp setup.py build/
cp README.md build/
sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' build/setup.py
......@@ -209,10 +209,12 @@ dev-install-node-modules:
ln -sf ${PWD}/src/nni_manager/dist $(NNI_PKG_FOLDER)
cp src/nni_manager/package.json $(NNI_PKG_FOLDER)
sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(NNI_PKG_FOLDER)/package.json
ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)/node_modules
ln -sf ${PWD}/src/webui/build $(NNI_PKG_FOLDER)/static
ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)/build
ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)/server.js
ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)
ln -sf ${PWD}/src/webui/build -t $(NNI_PKG_FOLDER)
mv $(NNI_PKG_FOLDER)/build $(NNI_PKG_FOLDER)/static
mkdir -p $(NASUI_PKG_FOLDER)
ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)
ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)
.PHONY: install-scripts
install-scripts:
......
......@@ -6,7 +6,7 @@
For debugging NNI source code, your development environment should be under Ubuntu 16.04 (or above) system with python 3 and pip 3 installed, then follow the below steps.
**1. Clone the source code**
### 1. Clone the source code
Run the command
......@@ -16,7 +16,7 @@ git clone https://github.com/Microsoft/nni.git
to clone the source code
**2. Prepare the debug environment and install dependencies**
### 2. Prepare the debug environment and install dependencies
Change directory to the source code folder, then run the command
......@@ -26,7 +26,7 @@ make install-dependencies
to install the dependent tools for the environment
**3. Build source code**
### 3. Build source code
Run the command
......@@ -36,7 +36,7 @@ make build
to build the source code
**4. Install NNI to development environment**
### 4. Install NNI to development environment
Run the command
......@@ -46,7 +46,7 @@ make dev-install
to install the distribution content to development environment, and create cli scripts
**5. Check if the environment is ready**
### 5. Check if the environment is ready
Now, you can try to start an experiment to check if your environment is ready.
For example, run the command
......@@ -57,9 +57,21 @@ nnictl create --config ~/nni/examples/trials/mnist-tfv1/config.yml
And open WebUI to check if everything is OK
**6. Redeploy**
### 6. Redeploy
After changing the code, you may need to redeploy, depending on what kind of code was changed.
#### Python
No redeployment is needed, but nnictl may need to be restarted.
#### TypeScript
* If `src/nni_manager` is changed, keep `yarn watch` running under this folder. It will rebuild the code instantly.
* If `src/webui` or `src/nasui` is changed, use **step 3** to rebuild code.
The nnictl may need to be restarted.
After the code changes, use **step 3** to rebuild your codes, then the changes will take effect immediately.
---
Finally, we wish you a wonderful day.
......
......@@ -6,6 +6,7 @@
"build": "tsc",
"test": "nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors",
"start": "node dist/main.js",
"watch": "tsc --watch",
"eslint": "npx eslint ./ --ext .ts"
},
"license": "MIT",
......
......@@ -442,7 +442,7 @@ class RemoteMachineTrainingService implements TrainingService {
//Begin to execute gpu_metrics_collection scripts
const script = getGpuMetricsCollectorBashScriptContent(remoteGpuScriptCollectorDir);
SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn);
SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn, true);
const disposable: Rx.IDisposable = this.timer.subscribe(
async (tick: number) => {
......@@ -588,7 +588,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy files in codeDir to remote working directory
await SSHClientUtility.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, sshClient, this.remoteOS);
// Execute command in remote machine
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient);
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient, true);
}
private getRmMetaByHost(host: string): RemoteMachineMeta {
......
......@@ -58,7 +58,7 @@ export namespace SSHClientUtility {
* @param command the command to execute remotely
* @param client SSH Client
*/
export function remoteExeCommand(command: string, client: Client): Promise<RemoteCommandResult> {
export function remoteExeCommand(command: string, client: Client, useShell: boolean = false): Promise<RemoteCommandResult> {
const log: Logger = getLogger();
log.debug(`remoteExeCommand: command: [${command}]`);
const deferred: Deferred<RemoteCommandResult> = new Deferred<RemoteCommandResult>();
......@@ -66,31 +66,43 @@ export namespace SSHClientUtility {
let stderr: string = '';
let exitCode: number;
client.exec(command, (err: Error, channel: ClientChannel) => {
const callback = (err: Error, channel: ClientChannel): void => {
if (err !== undefined && err !== null) {
log.error(`remoteExeCommand: ${err.message}`);
deferred.reject(err);
return;
}
channel.on('data', (data: any, dataStderr: any) => {
if (dataStderr !== undefined && dataStderr !== null) {
stderr += data.toString();
} else {
stdout += data.toString();
}
})
.on('exit', (code: any, signal: any) => {
channel.on('data', (data: any) => {
stdout += data;
});
channel.on('exit', (code: any) => {
exitCode = <number>code;
log.debug(`remoteExeCommand exit(${exitCode})\nstdout: ${stdout}\nstderr: ${stderr}`);
deferred.resolve({
stdout : stdout,
stderr : stderr,
exitCode : exitCode
stdout: stdout,
stderr: stderr,
exitCode: exitCode
});
});
channel.stderr.on('data', function (data) {
stderr += data;
});
if (useShell) {
channel.stdin.write(`${command}\n`);
channel.end("exit\n");
}
return;
};
if (useShell) {
client.shell(callback);
} else {
client.exec(command, callback);
}
return deferred.promise;
}
......
......@@ -5593,7 +5593,7 @@ load-json-file@^4.0.0:
pify "^3.0.0"
strip-bom "^3.0.0"
loader-fs-cache@>=1.0.3, loader-fs-cache@^1.0.0:
loader-fs-cache@^1.0.0:
version "1.0.3"
resolved "https://registry.yarnpkg.com/loader-fs-cache/-/loader-fs-cache-1.0.3.tgz#f08657646d607078be2f0a032f8bd69dd6f277d9"
integrity sha512-ldcgZpjNJj71n+2Mf6yetz+c9bM4xpKtNds4LbqXzU/PTdeAX0g3ytnU1AJMEcTk2Lex4Smpe3Q/eCTsvUBxbA==
......
......@@ -10,6 +10,7 @@ import traceback
from xml.dom import minidom
def check_ready_to_run():
if sys.platform == 'win32':
pgrep_output = subprocess.check_output(
......@@ -20,17 +21,20 @@ def check_ready_to_run():
pidList.remove(os.getpid())
return not pidList
else:
pgrep_output = subprocess.check_output('pgrep -fxu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pidList.append(int(pid))
pidList.remove(os.getpid())
pid = pid.decode()
if "pgrep " in pid or pid.startswith('%s ' % os.getpid()) or pid.startswith('%s ' % os.getppid()):
continue
pidList.append(pid)
return not pidList
def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
if check_ready_to_run() == False:
# GPU metrics collector is already running. Exit
print("GPU metrics collector is already running. exiting...")
exit(2)
cmd = 'nvidia-smi -q -x'.split()
while(True):
......@@ -44,6 +48,7 @@ def main(argv):
# TODO: change to sleep time configurable via arguments
time.sleep(5)
def parse_nvidia_smi_result(smi, outputDir):
try:
old_umask = os.umask(0)
......@@ -70,13 +75,14 @@ def parse_nvidia_smi_result(smi, outputDir):
outPut["gpuInfos"].append(gpuInfo)
print(outPut)
outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
outputFile.flush();
except:
outputFile.flush()
except Exception as error:
# e_info = sys.exc_info()
print('xmldoc paring error')
print('gpu_metrics_collector error: %s' % error)
finally:
os.umask(old_umask)
def gen_empty_gpu_metric(outputDir):
try:
old_umask = os.umask(0)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment