Commit 101b02ff authored by Yan Ni, committed by fishyds
Browse files

mac support with local, remote & pai mode (#386)

* update Makefile for mac support, wait for aka.ms support

* refix Makefile for colorful echo

* update Makefile with shorturl

* fix false failure on mac webui

* fix cross os remote tmpdir issue

* add readonly to RemoteMachineTrainingService.remoteOS

* fix var name for PR 386
parent 694bb539
...@@ -3,30 +3,45 @@ ...@@ -3,30 +3,45 @@
PIP_INSTALL := python3 -m pip install PIP_INSTALL := python3 -m pip install
PIP_UNINSTALL := python3 -m pip uninstall PIP_UNINSTALL := python3 -m pip uninstall
## Colorful output # detect OS
_INFO := $(shell echo -e '\e[1;36m') UNAME_S := $(shell uname -s)
_WARNING := $(shell echo -e '\e[1;33m') ifeq ($(UNAME_S), Linux)
_END := $(shell echo -e '\e[0m') OS_SPEC := linux
## Colorful output
_INFO := $(shell echo -e '\e[1;36m')
_WARNING := $(shell echo -e '\e[1;33m')
_END := $(shell echo -e '\e[0m')
else ifeq ($(UNAME_S), Darwin)
OS_SPEC := darwin
else
$(error platform $(UNAME_S) not supported)
endif
## Install directories ## Install directories
ifeq ($(shell id -u), 0) # is root ifeq ($(shell id -u), 0) # is root
_ROOT := 1 _ROOT := 1
ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]).parents[2])') ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]).parents[2])')
BASH_COMP_SCRIPT ?= /usr/share/bash-completion/completions/nnictl BASH_COMP_PREFIX ?= /usr/share/bash-completion/completions
else # is normal user else # is normal user
ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getusersitepackages()).parents[2])') ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getusersitepackages()).parents[2])')
ifndef VIRTUAL_ENV ifndef VIRTUAL_ENV
PIP_MODE ?= --user PIP_MODE ?= --user
endif endif
BASH_COMP_SCRIPT ?= ${HOME}/.bash_completion.d/nnictl BASH_COMP_PREFIX ?= ${HOME}/.bash_completion.d
endif endif
BASH_COMP_SCRIPT := $(BASH_COMP_PREFIX)/nnictl
NNI_INSTALL_PATH ?= $(INSTALL_PREFIX)/nni
NNI_TMP_PATH ?= /tmp
BIN_FOLDER ?= $(ROOT_FOLDER)/bin BIN_FOLDER ?= $(ROOT_FOLDER)/bin
NNI_PKG_FOLDER ?= $(ROOT_FOLDER)/nni NNI_PKG_FOLDER ?= $(ROOT_FOLDER)/nni
## Dependency information ## Dependency information
NNI_NODE_TARBALL ?= /tmp/nni-node-linux-x64.tar.xz NNI_NODE_TARBALL ?= /tmp/nni-node-$(OS_SPEC)-x64.tar.xz
NNI_NODE_FOLDER = /tmp/nni-node-linux-x64 NNI_NODE_FOLDER = /tmp/nni-node-$(OS_SPEC)-x64
NNI_NODE ?= $(BIN_FOLDER)/node NNI_NODE ?= $(BIN_FOLDER)/node
NNI_YARN_TARBALL ?= /tmp/nni-yarn.tar.gz NNI_YARN_TARBALL ?= /tmp/nni-yarn.tar.gz
NNI_YARN_FOLDER ?= /tmp/nni-yarn NNI_YARN_FOLDER ?= /tmp/nni-yarn
...@@ -120,7 +135,7 @@ clean: ...@@ -120,7 +135,7 @@ clean:
$(NNI_NODE_TARBALL): $(NNI_NODE_TARBALL):
#$(_INFO) Downloading Node.js $(_END) #$(_INFO) Downloading Node.js $(_END)
wget https://aka.ms/nodejs-download -O $(NNI_NODE_TARBALL) wget https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(NNI_NODE_TARBALL)
$(NNI_YARN_TARBALL): $(NNI_YARN_TARBALL):
#$(_INFO) Downloading Yarn $(_END) #$(_INFO) Downloading Yarn $(_END)
...@@ -176,7 +191,8 @@ dev-install-node-modules: ...@@ -176,7 +191,8 @@ dev-install-node-modules:
.PHONY: install-scripts .PHONY: install-scripts
install-scripts: install-scripts:
install -Dm644 tools/bash-completion $(BASH_COMP_SCRIPT) mkdir -p $(BASH_COMP_PREFIX)
install -m644 tools/bash-completion $(BASH_COMP_SCRIPT)
.PHONY: update-bash-config .PHONY: update-bash-config
ifndef _ROOT ifndef _ROOT
......
...@@ -272,6 +272,14 @@ function getIPV4Address(): string { ...@@ -272,6 +272,14 @@ function getIPV4Address(): string {
throw Error('getIPV4Address() failed because no valid IPv4 address found.') throw Error('getIPV4Address() failed because no valid IPv4 address found.')
} }
/**
 * Resolve the temporary directory path to use on a remote machine.
 *
 * The path is chosen from the remote OS type instead of the local
 * `os.tmpdir()`, because the local and remote hosts may not run the same OS.
 *
 * @param osType OS identifier of the remote machine; only 'linux' is handled.
 * @returns '/tmp' when osType is exactly 'linux'.
 * @throws Error when osType is any other value.
 */
function getRemoteTmpDir(osType: string): string {
    // Strict comparison: no implicit coercion of the OS identifier.
    if (osType === 'linux') {
        return '/tmp';
    }

    throw new Error(`remote OS ${osType} not supported`);
}
/** /**
* Get the status of canceled jobs according to the hint isEarlyStopped * Get the status of canceled jobs according to the hint isEarlyStopped
*/ */
...@@ -279,5 +287,5 @@ function getJobCancelStatus(isEarlyStopped: boolean): TrialJobStatus { ...@@ -279,5 +287,5 @@ function getJobCancelStatus(isEarlyStopped: boolean): TrialJobStatus {
return isEarlyStopped ? 'EARLY_STOPPED' : 'USER_CANCELED'; return isEarlyStopped ? 'EARLY_STOPPED' : 'USER_CANCELED';
} }
export { generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getJobCancelStatus, export {getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getJobCancelStatus,
getDefaultDatabaseDir, getIPV4Address, mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect }; getDefaultDatabaseDir, getIPV4Address, mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect };
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
"version": "1.0.0", "version": "1.0.0",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"postbuild": "cp -f --parent scripts/*.py ./dist/", "postbuild": "cp -rf scripts ./dist/",
"build": "tsc", "build": "tsc",
"test": "mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --colors", "test": "mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --colors",
"start": "node dist/main.js" "start": "node dist/main.js"
......
...@@ -169,7 +169,7 @@ class LocalTrainingService implements TrainingService { ...@@ -169,7 +169,7 @@ class LocalTrainingService implements TrainingService {
this.setTrialJobStatus(trialJob, 'FAILED'); this.setTrialJobStatus(trialJob, 'FAILED');
try { try {
const state: string = await fs.promises.readFile(path.join(trialJob.workingDirectory, '.nni', 'state'), 'utf8'); const state: string = await fs.promises.readFile(path.join(trialJob.workingDirectory, '.nni', 'state'), 'utf8');
const match: RegExpMatchArray | null = state.trim().match(/^(\d+)\s+(\d+)$/); const match: RegExpMatchArray | null = state.trim().match(/^(\d+)\s+(\d+)/);
if (match !== null) { if (match !== null) {
const { 1: code, 2: timestamp } = match; const { 1: code, 2: timestamp } = match;
if (parseInt(code, 10) === 0) { if (parseInt(code, 10) === 0) {
......
...@@ -36,7 +36,7 @@ import { ObservableTimer } from '../../common/observableTimer'; ...@@ -36,7 +36,7 @@ import { ObservableTimer } from '../../common/observableTimer';
import { import {
HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
} from '../../common/trainingService'; } from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus } from '../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus, getRemoteTmpDir } from '../../common/utils';
import { GPUSummary } from '../common/gpuData'; import { GPUSummary } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig'; import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
...@@ -66,8 +66,10 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -66,8 +66,10 @@ class RemoteMachineTrainingService implements TrainingService {
private log: Logger; private log: Logger;
private isMultiPhase: boolean = false; private isMultiPhase: boolean = false;
private trialSequenceId: number; private trialSequenceId: number;
private readonly remoteOS: string;
constructor(@component.Inject timer: ObservableTimer) { constructor(@component.Inject timer: ObservableTimer) {
this.remoteOS = 'linux';
this.metricsEmitter = new EventEmitter(); this.metricsEmitter = new EventEmitter();
this.trialJobsMap = new Map<string, RemoteMachineTrialJobDetail>(); this.trialJobsMap = new Map<string, RemoteMachineTrialJobDetail>();
this.machineSSHClientMap = new Map<RemoteMachineMeta, Client>(); this.machineSSHClientMap = new Map<RemoteMachineMeta, Client>();
...@@ -372,7 +374,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -372,7 +374,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy NNI scripts to remote experiment working directory // Copy NNI scripts to remote experiment working directory
const remoteScriptsDir: string = this.getRemoteScriptsPath(); const remoteScriptsDir: string = this.getRemoteScriptsPath();
await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteScriptsDir}`, conn); await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteScriptsDir}`, conn);
await SSHClientUtility.copyDirectoryToRemote('./scripts', remoteScriptsDir, conn); await SSHClientUtility.copyDirectoryToRemote('./scripts', remoteScriptsDir, conn, this.remoteOS);
await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, conn); await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, conn);
//Begin to execute gpu_metrics_collection scripts //Begin to execute gpu_metrics_collection scripts
...@@ -485,7 +487,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -485,7 +487,7 @@ class RemoteMachineTrainingService implements TrainingService {
await this.writeParameterFile(trialJobId, form.hyperParameters, rmScheduleInfo.rmMeta); await this.writeParameterFile(trialJobId, form.hyperParameters, rmScheduleInfo.rmMeta);
// Copy files in codeDir to remote working directory // Copy files in codeDir to remote working directory
await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient); await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient, this.remoteOS);
// Execute command in remote machine // Execute command in remote machine
SSHClientUtility.remoteExeCommand(`bash ${path.join(trialWorkingFolder, 'run.sh')}`, sshClient); SSHClientUtility.remoteExeCommand(`bash ${path.join(trialWorkingFolder, 'run.sh')}`, sshClient);
} }
...@@ -576,7 +578,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -576,7 +578,7 @@ class RemoteMachineTrainingService implements TrainingService {
} }
private getRemoteExperimentRootDir(): string{ private getRemoteExperimentRootDir(): string{
return path.join(os.tmpdir(), 'nni', 'experiments', getExperimentId()); return path.join(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId());
} }
private getJobPidPath(jobId: string): string { private getJobPidPath(jobId: string): string {
......
...@@ -28,7 +28,7 @@ import * as stream from 'stream'; ...@@ -28,7 +28,7 @@ import * as stream from 'stream';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
import { getLogger } from '../../common/log'; import { getLogger } from '../../common/log';
import { uniqueString } from '../../common/utils'; import { uniqueString, getRemoteTmpDir } from '../../common/utils';
import { RemoteCommandResult } from './remoteMachineData'; import { RemoteCommandResult } from './remoteMachineData';
/** /**
...@@ -43,11 +43,11 @@ export namespace SSHClientUtility { ...@@ -43,11 +43,11 @@ export namespace SSHClientUtility {
* @param remoteDirectory remote directory * @param remoteDirectory remote directory
* @param sshClient SSH client * @param sshClient SSH client
*/ */
export async function copyDirectoryToRemote(localDirectory : string, remoteDirectory : string, sshClient : Client) : Promise<void> { export async function copyDirectoryToRemote(localDirectory : string, remoteDirectory : string, sshClient : Client, remoteOS: string) : Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
const tmpTarName: string = `${uniqueString(10)}.tar.gz`; const tmpTarName: string = `${uniqueString(10)}.tar.gz`;
const localTarPath: string = path.join(os.tmpdir(), tmpTarName); const localTarPath: string = path.join(os.tmpdir(), tmpTarName);
const remoteTarPath: string = path.join(os.tmpdir(), tmpTarName); const remoteTarPath: string = path.join(getRemoteTmpDir(remoteOS), tmpTarName);
// Compress files in local directory to experiment root directory // Compress files in local directory to experiment root directory
await cpp.exec(`tar -czf ${localTarPath} -C ${localDirectory} .`); await cpp.exec(`tar -czf ${localTarPath} -C ${localDirectory} .`);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment